def separate_dataset(self, in_file, out_file, check_function):
    """
    Separate the original word similarity dataset according to a mapping criterion.

    word similarity of nouns: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt
    the LCS is in the knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
    both words are in the knowledge graph: type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

    :param in_file: source dataset file
    :param out_file: target dataset file
    :param check_function: a predicate over a word pair (w1, w2) deciding whether the pair is kept
    :return:
    """
    out_data = []
    word_pairs, human = self.load_dataset(in_file)
    for i, pairs in enumerate(word_pairs):
        w1, w2 = pairs
        h = human[i]
        if check_function(w1, w2):
            out_data.append(' '.join([w1, w2, str(h)]))
    FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
def separate_dataset(self, in_file, out_file, check_function):
    """
    Separate the original word similarity dataset according to a mapping criterion.

    word similarity of nouns: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt
    the LCS is in the knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
    both words are in the knowledge graph: type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

    :param in_file: source dataset file
    :param out_file: target dataset file
    :param check_function: a predicate over a word pair (w1, w2) deciding whether the pair is kept
    :return:
    """
    out_data = []
    word_pairs, human = self.load_dataset(in_file)
    for i, pairs in enumerate(word_pairs):
        w1, w2 = pairs
        h = human[i]
        if check_function(w1, w2):
            out_data.append(' '.join([w1, w2, str(h)]))
    FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)
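# Hedged usage sketch (not from the original source): separate_dataset above accepts any predicate over a
# word pair. Assuming the method belongs to the word similarity evaluation class, a noun-only split could
# be produced with NLTK's WordNet; the file names and the evaluator instance below are illustrative.
from nltk.corpus import wordnet as wn

def check_noun_pair(w1, w2):
    # keep a pair only when both words have at least one noun synset in WordNet
    return bool(wn.synsets(w1, pos=wn.NOUN)) and bool(wn.synsets(w2, pos=wn.NOUN))

# evaluator.separate_dataset('ws353.txt', 'noun_ws353', check_noun_pair)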
def __init__(self, DICT='models/abstract/abstracts.dict',
             TFIDF_MODEL='models/abstract/abstracts_tfidf.model',
             LSA_MODEL='models/abstract/abstracts_lsi.model'):
    try:
        from nltk.tokenize import RegexpTokenizer
        from nltk.stem import WordNetLemmatizer
        import nltk
        self._tokenizer = RegexpTokenizer(r'[a-z]+')
        self._lemma = WordNetLemmatizer()
        self._stopwords = set(nltk.corpus.stopwords.words('english'))
    except Exception:
        print('Install NLTK and download WordNet!')
        import sys
        sys.exit()
    try:
        from gensim import corpora, models
        from sematch.utility import FileIO
        self._dict = corpora.Dictionary.load(FileIO.filename(DICT))
        self._tfidf = models.TfidfModel.load(FileIO.filename(TFIDF_MODEL))
        self._lsa = models.LsiModel.load(FileIO.filename(LSA_MODEL))
    except Exception:
        print('Install gensim and prepare models data!')
        import sys
        sys.exit()
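# Hedged sketch (not part of the original class): with the dictionary, TF-IDF and LSA models loaded above,
# a new text would typically be projected into the LSA space as below. The method name is an assumption;
# only the attributes created in __init__ and standard gensim/NLTK calls are used.
def text_to_lsa_vector(self, text):
    tokens = self._tokenizer.tokenize(text.lower())
    tokens = [self._lemma.lemmatize(t) for t in tokens if t not in self._stopwords]
    bow = self._dict.doc2bow(tokens)    # bag-of-words vector from the gensim dictionary
    return self._lsa[self._tfidf[bow]]  # TF-IDF weighting followed by LSA projection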
def graph_ic_writer(self, filename, data):
    """
    Save the IC value of a concept for faster access.

    :param filename: the file to which the JSON record is appended
    :param data: a dict containing the concept and its IC value
    :return:
    """
    FileIO.append_json_file(filename, data)
def test_embedding():
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True)
    model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy', 'sad'))
def test_evaluation():
    from sematch.utility import FileIO
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    print len(query)
    print len(question)
    print len(tweet)
def test_entity_feature():
    from sematch.utility import FileIO
    from sematch.nlp import EntityFeature
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    import itertools
    candidates = list(itertools.chain.from_iterable(map(lambda x: x['candidate'], question)))
    set_candidates = list(set(candidates))
    print len(set_candidates)
    EntityFeature.candidate_features(set_candidates, export_file='models/question_features.json')
def save_result(self, cor, sim_values, sim_name, dataset_name):
    """
    This function saves the result computed by a similarity metric.

    :param cor: correlation with human ratings
    :param sim_values: similarity scores for word pairs
    :param sim_name: the name of the similarity metric
    :param dataset_name: the name of the word similarity dataset
    :return:
    """
    data = ["%.3f" % cor]
    data += map(lambda x: "%.3f" % x, sim_values)
    FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)
def save_result(self, cor, sim_values, sim_name, dataset_name):
    """
    This function saves the result computed by a similarity metric.

    :param cor: correlation with human ratings
    :param sim_values: similarity scores for word pairs
    :param sim_name: the name of the similarity metric
    :param dataset_name: the name of the word similarity dataset
    :return:
    """
    data = ["%.3f" % cor]
    data += map(lambda x: "%.3f" % x, sim_values)
    FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)
def evaluate(self, input_file, output_file):
    """
    Evaluate the sentence similarity.

    :param input_file: corpus file
    :param output_file: result file
    :return: similarity scores of text pairs
    """
    corpus = self.load_dataset(input_file)
    print 'dataset size: ', len(corpus)
    result = [self._sim_metric(t1, t2) for t1, t2 in corpus]
    result = map(lambda x: "%.3f" % round(x, 3), result)
    FileIO.save_list_file(output_file, result)
    return result
def candidate_features(cls, candidates, export_file='models/candidate_features.json',
                       feature_dict_file='models/entity_features.json'):
    from sematch.utility import FileIO
    entity_features = FileIO.read_json_file(feature_dict_file)
    entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features}
    features = []
    for i, can in enumerate(candidates):
        print i, " ", can
        data = {}
        data['dbr'] = can
        data['desc'] = entity_features[can][0] if can in entity_features else None
        data['cat'] = entity_features[can][1] if can in entity_features else []
        features.append(data)
    FileIO.save_json_file(export_file, features)
    return features
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    :param vec_file: the file storing the word vectors
    :param binary: whether the vectors are stored in binary format; the GoogleNews model is binary while the Yelp model is not
    """
    self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
def graph_ic_reader(self, filename):
    """
    Load the saved IC values.

    :param filename: the file containing IC values of concepts
    :return: a dictionary mapping concept to IC
    """
    data = FileIO.read_json_file(filename)
    return {d['concept']: float(d['ic']) for d in data}
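# Illustrative record layout, inferred from graph_ic_writer and graph_ic_reader above: each JSON record
# stores a concept and its information content, and the reader turns the records into a {concept: ic}
# dictionary. The concept URI and value below are made up.
# {"concept": "http://dbpedia.org/class/yago/Actor109765278", "ic": 5.873}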
def load(cls, feature_dict_file='models/entity_features.json'):
    from sematch.utility import FileIO
    entity_features = FileIO.read_json_file(feature_dict_file)
    entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features}
    return cls(entity_features)
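# Hedged sketch of the entity feature file, inferred from candidate_features and load above:
# models/entity_features.json is a list of JSON records per DBpedia resource, which load() turns into a
# {dbr: (desc, cat)} dictionary. The resource, description and category below are made up.
# {"dbr": "http://dbpedia.org/resource/Berlin", "desc": "Berlin is the capital of Germany.", "cat": ["Capitals_in_Europe"]}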
def __init__(self, src='models/dbpedia_2015-04.owl'):
    self.graph = rdflib.Graph()
    self.graph.parse(FileIO.filename(src))
    self.root = 'http://www.w3.org/2002/07/owl#Thing'
    self.classes = [s for s in self.graph.subjects(RDF.type, OWL.Class)]
    self.o_properties = [s for s in self.graph.subjects(RDF.type, OWL.ObjectProperty)]
    self.d_properties = [s for s in self.graph.subjects(RDF.type, OWL.DatatypeProperty)]
    self.uri2class = {c.toPython(): c for c in self.classes}
    self.uri2class[self.root] = rdflib.URIRef(self.root)
    self.class_labels = [self.token(c) for c in self.classes]
def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
    WordNetSimilarity.__init__(self)
    self._graph_ic = GraphIC(graph_ic)
    self._mappings = FileIO.read_json_file(mappings)
    self._id2mappings = {data['offset']: data for data in self._mappings}
    self._yago2id = {data['yago_dbpedia']: data['offset'] for data in self._mappings}
def load_result(self, sim_name, dataset_name):
    """
    This function loads the result of a similarity metric for a specific dataset.

    :param sim_name: the name of the similarity metric
    :param dataset_name: the name of the word similarity dataset
    :return: correlation score and the rating scores generated by the similarity metric
    """
    data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
    data = list(map(float, data))
    return data[0], data[1:]
def load_result(self, sim_name, dataset_name):
    """
    This function loads the result of a similarity metric for a specific dataset.

    :param sim_name: the name of the similarity metric
    :param dataset_name: the name of the word similarity dataset
    :return: correlation score and the rating scores generated by the similarity metric
    """
    data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
    data = map(float, data)
    return data[0], data[1:]
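# Hedged round-trip sketch based on save_result and load_result above: the first line of a result file is
# the correlation with human ratings and each following line is one similarity score, all written as "%.3f".
# The metric name, dataset name and values below are illustrative.
# self.save_result(0.741, [0.917, 0.250, 0.583], 'wpath', 'noun_rg')
# cor, scores = self.load_result('wpath', 'noun_rg')   # cor == 0.741, scores == [0.917, 0.25, 0.583]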
def load_dataset(self, dataset_name):
    """
    This function loads the word similarity dataset.

    :param dataset_name: the file name of the word similarity dataset
    :return: word pairs and human ratings
    """
    data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
    #print "dataset ", dataset_name, " ", len(data), " word pairs"
    word_pairs = map(lambda x: (x.split()[0], x.split()[1]), data)
    human = list(map(float, map(lambda x: x.split()[2], data)))
    return word_pairs, human
def load_dataset(self, dataset_name):
    """
    This function loads the word similarity dataset.

    :param dataset_name: the file name of the word similarity dataset
    :return: word pairs and human ratings
    """
    data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
    #print "dataset ", dataset_name, " ", len(data), " word pairs"
    word_pairs = map(lambda x: (x.split()[0], x.split()[1]), data)
    human = map(float, map(lambda x: x.split()[2], data))
    return word_pairs, human
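# Illustrative file format for the word similarity datasets read above: one pair per line, two words and a
# human rating separated by whitespace (values made up), e.g.
#   tiger cat 7.35
#   book paper 5.50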
def load_dataset(self, dataset_file, cat_full=False):
    from BeautifulSoup import BeautifulSOAP as bs
    pairs = []
    with open(FileIO.filename(dataset_file), 'r') as f:
        corpus = f.read()
    opinions = bs(corpus).findAll('opinion')
    for op in opinions:
        if not op['target'] == 'NULL':
            t = op['target']
            c = op['category'] if cat_full else op['category'].split('#')[0]
            pairs.append((t, c))
    X, y = zip(*pairs)
    return X, y
def load_dataset(self, dataset_file):
    """
    Generate sentence pairs.

    :param dataset_file: dataset file
    :return: sentence pairs
    """
    data = FileIO.read_list_file(dataset_file)
    data = [d.strip() for d in data]
    corpus = []
    for d in data:
        item = d.split('\t')
        corpus.append((item[0], item[1]))
    return corpus
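# Hedged usage sketch linking load_dataset to evaluate above: the corpus file holds one tab-separated
# sentence pair per line, and evaluate writes one "%.3f" score per pair. File names and the evaluator
# instance below are illustrative.
#   input line: "A man is playing a guitar.\tA person plays an instrument."
# scores = evaluator.evaluate('dataset/sentence_pairs.txt', 'eval/sentence_scores.txt')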
def test_feature_extractor():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    import itertools
    sy = SpaCyNLP()
    w_extractor = FeatureExtractor(sy.pos_tag)
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    candidates = list(itertools.chain.from_iterable(map(lambda x: x['candidate'], query)))
    set_candidates = list(set(candidates))
    for can in set_candidates[:10]:
        print w_extractor.entity_word_features([can], features)
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x, y: rel.text_similarity(x, y, model='lsa')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print len(query)
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print m
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print precision_recall_fscore_support(gold, predict, average='weighted')[2]
def load(cls, name_dict_file='models/name.dict'):
    from sematch.utility import FileIO
    name = FileIO.read_json_file(name_dict_file)
    name = {n['name']: n['concepts'] for n in name}
    return cls(name)
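# Hedged sketch of models/name.dict, inferred from load above: a JSON list mapping a surface name to its
# candidate concepts, which load() turns into a {name: concepts} dictionary. The entry below is made up.
# {"name": "apple", "concepts": ["http://dbpedia.org/resource/Apple", "http://dbpedia.org/resource/Apple_Inc."]}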
def load_stopwords(self, filename):
    data = FileIO.read_list_file(FileIO.filename(filename))
    data = [d.split() for d in data[1:]]  # skip the first line; split each line in case it contains more than one word
    data = list(itertools.chain.from_iterable(data))
    return data
def load_dataset(self):
    data = FileIO.read_json_file('dataset/aspect/data.txt')
    X, y = zip(*[(d['text'], d['label']) for d in data])
    return X, y
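# Illustrative record for dataset/aspect/data.txt, inferred from load_dataset above: each JSON record pairs
# a text with its aspect label (example made up).
# {"text": "The staff was very friendly.", "label": "service"}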