Example #1
    def separate_dataset(self, in_file, out_file, check_function):
        """
        This function is used to separate the original word similarity dataset.

        word similarity of nouns: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the LCS (least common subsumer) is in the knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in the knowledge graph: type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: predicate applied to each word pair to decide whether it belongs in the target dataset
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
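The check functions themselves are not shown in these examples. As a minimal sketch (not part of the source), a noun-only filter for the noun_*.txt splits could be built with NLTK's WordNet interface; the call at the end is hypothetical and assumes an instance of the enclosing evaluation class named dataset.

# Sketch only: keep a pair when both words have at least one noun synset in WordNet.
from nltk.corpus import wordnet as wn

def check_noun_pair(w1, w2):
    return bool(wn.synsets(w1, pos=wn.NOUN)) and bool(wn.synsets(w2, pos=wn.NOUN))

# dataset.separate_dataset('rg', 'noun_rg', check_noun_pair)  # hypothetical usage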
Example #2
    def separate_dataset(self, in_file, out_file, check_function):
        """
        This function is used to separate the original word similarity dataset.

        word similarity of nouns: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the LCS (least common subsumer) is in the knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in the knowledge graph: type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: predicate applied to each word pair to decide whether it belongs in the target dataset
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)
Example #3
 def __init__(self,
              DICT='models/abstract/abstracts.dict',
              TFIDF_MODEL='models/abstract/abstracts_tfidf.model',
              LSA_MODEL='models/abstract/abstracts_lsi.model'):
     try:
         from nltk.tokenize import RegexpTokenizer
         from nltk.stem import WordNetLemmatizer
         import nltk
         self._tokenizer = RegexpTokenizer(r'[a-z]+')
         self._lemma = WordNetLemmatizer()
         self._stopwords = set(nltk.corpus.stopwords.words('english'))
     except (ImportError, LookupError):  # NLTK missing or WordNet/stopwords data not downloaded
         print('Install NLTK and download WordNet!')
         import sys
         sys.exit()
     try:
         from gensim import corpora, models
         from sematch.utility import FileIO
         self._dict = corpora.Dictionary.load(FileIO.filename(DICT))
         self._tfidf = models.TfidfModel.load(FileIO.filename(TFIDF_MODEL))
         self._lsa = models.LsiModel.load(FileIO.filename(LSA_MODEL))
     except Exception:  # gensim missing or model files not available
         print('Install gensim and prepare models data!')
         import sys
         sys.exit()
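The dictionary, TF-IDF and LSI files loaded above are standard gensim artifacts. A rough sketch of how such files might be produced from a tokenized corpus (the toy texts, topic count and local output paths are assumptions):

from gensim import corpora, models

texts = [['semantic', 'similarity', 'graph'], ['entity', 'linking', 'graph']]  # toy tokenized abstracts
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
tfidf = models.TfidfModel(bow)
lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)
dictionary.save('abstracts.dict')        # sematch expects these under models/abstract/
tfidf.save('abstracts_tfidf.model')
lsi.save('abstracts_lsi.model')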
Example #4
 def graph_ic_writer(self, filename, data):
     """
     Save the IC (information content) values of concepts for faster access.
     :param filename: target JSON file for the cached IC values
     :param data: IC records to append
     :return:
     """
     FileIO.append_json_file(filename, data)
Example #5
 def graph_ic_writer(self, filename, data):
     """
     Save the IC (information content) values of concepts for faster access.
     :param filename: target JSON file for the cached IC values
     :param data: IC records to append
     :return:
     """
     FileIO.append_json_file(filename, data)
Example #6
def test_embedding():
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True)
    model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy','sad'))
Example #7
File: test_ned.py Project: zflgte/sematch
def test_evaluation():
    from sematch.utility import FileIO
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    print(len(query))
    print(len(question))
    print(len(tweet))
Example #8
File: test_ned.py Project: gsi-upm/sematch
def test_evaluation():
    from sematch.utility import FileIO
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    print(len(query))
    print(len(question))
    print(len(tweet))
Example #9
File: test_nlp.py Project: zflgte/sematch
def test_entity_feature():
    from sematch.utility import FileIO
    from sematch.nlp import EntityFeature
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    import itertools
    candidates = list(itertools.chain.from_iterable(map(lambda x:x['candidate'], question)))
    set_candidates = list(set(candidates))
    print(len(set_candidates))
    EntityFeature.candidate_features(set_candidates, export_file='models/question_features.json')
Example #10
 def save_result(self, cor, sim_values, sim_name, dataset_name):
     """
     This function saves the result computed by a similarity metric.
     :param cor: correlation with human rating
     :param sim_values: similarity scores for word pairs
     :param sim_name: the name of similarity metric
     :param dataset_name: the name of word similarity dataset
     :return:
     """
     data = ["%.3f" % cor]
     data += map(lambda x: "%.3f" % x, sim_values)
     FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)
Example #11
 def save_result(self, cor, sim_values, sim_name, dataset_name):
     """
     This function saves the result computed by a similarity metric.
     :param cor: correlation with human rating
     :param sim_values: similarity scores for word pairs
     :param sim_name: the name of similarity metric
     :param dataset_name: the name of word similarity dataset
     :return:
     """
     data = ["%.3f" % cor]
     data += map(lambda x: "%.3f" % x, sim_values)
     FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)
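For context, cor is a correlation against the human ratings; a minimal sketch of producing it with SciPy before calling save_result (the numbers and the evaluator name are illustrative):

from scipy.stats import spearmanr

human = [3.8, 1.2, 7.5]          # gold ratings from load_dataset
sim_values = [0.61, 0.10, 0.93]  # scores from one similarity metric
cor = spearmanr(human, sim_values)[0]
# evaluator.save_result(cor, sim_values, 'wpath', 'noun_rg')  # hypothetical call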
Example #12
 def evaluate(self, input_file, output_file):
     """
     Evaluate the sentence similarity
     :param input_file: corpus file
     :param output_file: result file
     :return: similarity scores of text pairs
     """
     corpus = self.load_dataset(input_file)
     print('dataset size: ', len(corpus))
     result = [self._sim_metric(t1, t2) for t1, t2 in corpus]
     result = ["%.3f" % round(x, 3) for x in result]
     FileIO.save_list_file(output_file, result)
     return result
Example #13
 def evaluate(self, input_file, output_file):
     """
     Evaluate the sentence similarity
     :param input_file: corpus file
     :param output_file: result file
     :return: similarity scores of text pairs
     """
     corpus = self.load_dataset(input_file)
     print('dataset size: ', len(corpus))
     result = [self._sim_metric(t1, t2) for t1, t2 in corpus]
     result = ["%.3f" % round(x, 3) for x in result]
     FileIO.save_list_file(output_file, result)
     return result
Example #14
File: nlp.py Project: gsi-upm/sematch
 def candidate_features(cls, candidates, export_file='models/candidate_features.json',
                        feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features}
     features = []
     for i, can in enumerate(candidates):
         print i, " ", can
         data = {}
         data['dbr'] = can
         data['desc'] = entity_features[can][0] if can in entity_features else None
         data['cat'] = entity_features[can][1] if can in entity_features else []
         features.append(data)
     FileIO.save_json_file(export_file, features)
     return features
Example #15
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy','sad'))
Example #16
    def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
        """

        :param vec_file: the file storing vectors
        :param binary: if vector are stored in binary. Google news use binary while yelp not
        """
        self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
Example #17
 def graph_ic_reader(self, filename):
     """
     Load the saved IC values
     :param filename: the file containing IC values of concepts
     :return: a dictionary concept:IC
     """
     data = FileIO.read_json_file(filename)
     return {d['concept']:float(d['ic']) for d in data}
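Together with the graph_ic_writer shown earlier, this reader implies one JSON record per concept with 'concept' and 'ic' keys. A small sketch of that assumed on-disk layout using the standard json module (URIs and values are illustrative):

import json

records = [
    {'concept': 'http://dbpedia.org/ontology/Actor', 'ic': 5.317},
    {'concept': 'http://dbpedia.org/ontology/City', 'ic': 4.002},
]
with open('graph_ic_sample.txt', 'w') as f:
    for r in records:
        f.write(json.dumps(r) + '\n')  # one JSON object per line, as the reader/writer pair suggests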
Example #18
File: nlp.py Project: TheoSeo93/sematch
 def load(cls, feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {
         e['dbr']: (e['desc'], e['cat'])
         for e in entity_features
     }
     return cls(entity_features)
Example #19
 def graph_ic_reader(self, filename):
     """
     Load the saved IC values
     :param filename: the file containing IC values of concepts
     :return: a dictionary concept:IC
     """
     data = FileIO.read_json_file(filename)
     return {d['concept']: float(d['ic']) for d in data}
Example #20
 def __init__(self, src='models/dbpedia_2015-04.owl'):
     self.graph = rdflib.Graph()
     self.graph.parse(FileIO.filename(src))
     self.root = 'http://www.w3.org/2002/07/owl#Thing'
     self.classes = [s for s in self.graph.subjects(RDF.type, OWL.Class)]
     self.o_properties = [s for s in self.graph.subjects(RDF.type, OWL.ObjectProperty)]
     self.d_properties = [s for s in self.graph.subjects(RDF.type, OWL.DatatypeProperty)]
     self.uri2class = {c.toPython():c for c in self.classes}
     self.uri2class[self.root] = rdflib.URIRef(self.root)
     self.class_labels = [self.token(c) for c in self.classes]
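Given the parsed ontology graph above, other rdflib queries follow the same subject/predicate/object pattern. A hedged sketch of walking the direct superclasses of a class (the helper name is illustrative, not part of the source):

from rdflib.namespace import RDFS

def direct_superclasses(graph, cls):
    # rdfs:subClassOf triples give the parents of a class in the DBpedia ontology
    return list(graph.objects(cls, RDFS.subClassOf))

# e.g. direct_superclasses(self.graph, self.uri2class['http://dbpedia.org/ontology/Actor'])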
Example #21
    def __init__(self,
                 vec_file='models/GoogleNews-vectors-negative300.bin',
                 binary=True):
        """

        :param vec_file: the file storing vectors
        :param binary: if vector are stored in binary. Google news use binary while yelp not
        """
        self._wordvec = Word2Vec.load_word2vec_format(
            FileIO.filename(vec_file), binary=binary)
Example #22
 def __init__(self,
              graph_ic='models/yago_type_ic.txt',
              mappings="models/type-linkings.txt"):
     WordNetSimilarity.__init__(self)
     self._graph_ic = GraphIC(graph_ic)
     self._mappings = FileIO.read_json_file(mappings)
     self._id2mappings = {data['offset']: data for data in self._mappings}
     self._yago2id = {
         data['yago_dbpedia']: data['offset']
         for data in self._mappings
     }
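The two comprehensions above imply that each mapping record carries at least an 'offset' and a 'yago_dbpedia' key. A sketch of the assumed record shape (field values are illustrative):

mapping_record = {
    'offset': '02084071-n',  # identifier used to index the mapping (illustrative)
    'yago_dbpedia': 'http://dbpedia.org/class/yago/Dog102084071',  # linked YAGO/DBpedia type (illustrative)
}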
Example #23
    def load_result(self, sim_name, dataset_name):
        """
        This function loads the result of a similarity metric for a specific dataset

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: correlation score and the rating scores generated by the similarity metric
        """
        data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
        data = list(map(float, data))
        return data[0], data[1:]
Example #24
    def load_result(self, sim_name, dataset_name):
        """
        This function loads the result of a similarity metric for a specific dataset

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: correlation score and the rating scores generated by the similarity metric
        """
        data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
        data = list(map(float, data))
        return data[0], data[1:]
Example #25
    def load_dataset(self, dataset_name):
        """
        This function loads the word similarity dataset.

        :param dataset_name: the file name of the word similarity dataset
        :return: word pairs and human ratings
        """
        data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        word_pairs = map(lambda x: (x.split()[0], x.split()[1]), data)
        human = list(map(float, map(lambda x: x.split()[2], data)))
        return word_pairs, human
Example #26
    def load_dataset(self, dataset_name):
        """
        This function loads the word similarity dataset.

        :param dataset_name: the file name of the word similarity dataset
        :return: word pairs and human ratings
        """
        data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        word_pairs = [(x.split()[0], x.split()[1]) for x in data]
        human = [float(x.split()[2]) for x in data]
        return word_pairs, human
Example #27
 def load_dataset(self, dataset_file, cat_full=False):
     from BeautifulSoup import BeautifulSOAP as bs
     pairs = []
     with open(FileIO.filename(dataset_file), 'r') as f:
         corpus = f.read()
         opinions = bs(corpus).findAll('opinion')
         for op in opinions:
             if not op['target'] == 'NULL':
                 t = op['target']
                 c = op['category'] if cat_full else op['category'].split('#')[0]
                 pairs.append((t, c))
     X, y = zip(*pairs)
     return X, y
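This parser expects <opinion> elements carrying target and category attributes, with categories shaped like ENTITY#ATTRIBUTE so that the split on '#' keeps only the entity part. A sketch of input that satisfies it (the markup is an assumption modelled on SemEval-style aspect data):

sample = '''
<sentences>
  <sentence>
    <opinions>
      <opinion target="pizza" category="FOOD#QUALITY" polarity="positive"/>
      <opinion target="NULL" category="SERVICE#GENERAL" polarity="negative"/>
    </opinions>
  </sentence>
</sentences>
'''
# Parsed as above, this yields the single pair ('pizza', 'FOOD') when cat_full is False;
# the NULL target is skipped.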
Example #28
File: nlp.py Project: TheoSeo93/sematch
 def candidate_features(cls,
                        candidates,
                        export_file='models/candidate_features.json',
                        feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {
         e['dbr']: (e['desc'], e['cat'])
         for e in entity_features
     }
     features = []
     for i, can in enumerate(candidates):
         print i, " ", can
         data = {}
         data['dbr'] = can
         data['desc'] = entity_features[can][0] if can in entity_features else None
         data['cat'] = entity_features[can][1] if can in entity_features else []
         features.append(data)
     FileIO.save_json_file(export_file, features)
     return features
Example #29
 def load_dataset(self, dataset_file):
     """
     Generate sentence pairs.
     :param dataset_file: dataset file
     :return: sentence pairs
     """
     data = FileIO.read_list_file(dataset_file)
     data = [d.strip() for d in data]
     corpus = []
     for d in data:
         item = d.split('\t')
         corpus.append((item[0], item[1]))
     return corpus
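load_dataset expects one tab-separated sentence pair per line. A tiny sketch of such an input file (contents are illustrative; FileIO.read_list_file is assumed to return the raw lines):

pairs = [
    'A man is playing a guitar.\tA person plays an instrument.',
    'The cat sits on the mat.\tA dog runs in the park.',
]
with open('sentence_pairs_sample.txt', 'w') as f:
    f.write('\n'.join(pairs))
# corpus = evaluator.load_dataset('sentence_pairs_sample.txt')  # hypothetical call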
Example #30
 def load_dataset(self, dataset_file):
     """
     Generate sentence pairs.
     :param dataset_file: dataset file
     :return: sentence pairs
     """
     data = FileIO.read_list_file(dataset_file)
     data = [d.strip() for d in data]
     corpus = []
     for d in data:
         item = d.split('\t')
         corpus.append((item[0], item[1]))
     return corpus
Example #31
File: test_nlp.py Project: zflgte/sematch
def test_feature_extractor():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    import itertools
    sy = SpaCyNLP()
    w_extractor = FeatureExtractor(sy.pos_tag)
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    candidates = list(itertools.chain.from_iterable(map(lambda x: x['candidate'], query)))
    set_candidates = list(set(candidates))
    for can in set_candidates[:10]:
        print(w_extractor.entity_word_features([can], features))
Example #32
 def __init__(self, src='models/dbpedia_2015-04.owl'):
     self.graph = rdflib.Graph()
     self.graph.parse(FileIO.filename(src))
     self.root = 'http://www.w3.org/2002/07/owl#Thing'
     self.classes = [s for s in self.graph.subjects(RDF.type, OWL.Class)]
     self.o_properties = [
         s for s in self.graph.subjects(RDF.type, OWL.ObjectProperty)
     ]
     self.d_properties = [
         s for s in self.graph.subjects(RDF.type, OWL.DatatypeProperty)
     ]
     self.uri2class = {c.toPython(): c for c in self.classes}
     self.uri2class[self.root] = rdflib.URIRef(self.root)
     self.class_labels = [self.token(c) for c in self.classes]
Example #33
 def load_dataset(self, dataset_file, cat_full=False):
     from BeautifulSoup import BeautifulSOAP as bs
     pairs = []
     with open(FileIO.filename(dataset_file), 'r') as f:
         corpus = f.read()
         opinions = bs(corpus).findAll('opinion')
         for op in opinions:
             if not op['target'] == 'NULL':
                 t = op['target']
                 c = op['category'] if cat_full else op['category'].split('#')[0]
                 pairs.append((t, c))
     X, y = zip(*pairs)
     return X, y
Example #34
File: test_ned.py Project: zflgte/sematch
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'],
                                          q['candidate'],
                                          similarity,
                                          K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold,
                                                  predict,
                                                  average='weighted')[2])
Example #35
File: test_ned.py Project: gsi-upm/sematch
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    #print wns.word_similarity('cooling', 'air_conditioner', 'li')
    #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa')

    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                #e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            #from sklearn.metrics import classification_report
            #print classification_report(gold, predict)
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
Example #36
File: nlp.py Project: gsi-upm/sematch
 def load(cls, name_dict_file='models/name.dict'):
     from sematch.utility import FileIO
     name = FileIO.read_json_file(name_dict_file)
     name = {n['name']: n['concepts'] for n in name}
     return cls(name)
Example #37
File: nlp.py Project: gsi-upm/sematch
 def load(cls, feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features}
     return cls(entity_features)
Example #38
File: nlp.py Project: TheoSeo93/sematch
 def load(cls, name_dict_file='models/name.dict'):
     from sematch.utility import FileIO
     name = FileIO.read_json_file(name_dict_file)
     name = {n['name']: n['concepts'] for n in name}
     return cls(name)
Example #39
 def load_stopwords(self, filename):
     data = FileIO.read_list_file(FileIO.filename(filename))
     data = [d.split() for d in data[1:]]  # skip the header line; split in case a line holds more than one word
     data = list(itertools.chain.from_iterable(data))
     return data
Example #40
 def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
     WordNetSimilarity.__init__(self)
     self._graph_ic = GraphIC(graph_ic)
     self._mappings = FileIO.read_json_file(mappings)
     self._id2mappings = {data['offset']: data for data in self._mappings}
     self._yago2id = {data['yago_dbpedia']: data['offset'] for data in self._mappings}
Example #41
 def load_dataset(self):
     data = FileIO.read_json_file('dataset/aspect/data.txt')
     X, y = zip(*[(d['text'], d['label']) for d in data])
     return X, y
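The zip comprehension above implies JSON records with 'text' and 'label' keys in dataset/aspect/data.txt. A sketch of the assumed record shape (values are illustrative):

aspect_record = {'text': 'The pasta was great but the service was slow.', 'label': 'FOOD'}
# zip(*[(d['text'], d['label']) for d in data]) separates the texts from the labels.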
Example #42
 def load_dataset(self):
     data = FileIO.read_json_file('dataset/aspect/data.txt')
     X, y = zip(*[(d['text'], d['label']) for d in data])
     return X, y
Example #43
File: nlp.py Project: zxch3n/sematch
 def load_stopwords(self, filename):
     data = FileIO.read_list_file(FileIO.filename(filename))
     data = [d.split() for d in data[1:]]  # skip the header line; split in case a line holds more than one word
     data = list(itertools.chain.from_iterable(data))
     return data