Example #1
def get_comparison(news_org, NP_to_sentence, VP_to_sentence,
                   NPs, VPs, NP_synsets, VP_synsets,
                   article_topic, article_headline, article_news_org):
  '''Compares the articles from a single NewsOrg to an article that is
  represented by its NPs and VPs.'''
  # synsets aren't picklable so they're stored as (pos, offset) and unpacked
  NP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                for (pos, offset) in NP_synsets]
  VP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                for (pos, offset) in VP_synsets]

  comparison_articles = news_org.get_query_results(article_topic)
  if not comparison_articles:
    logger.log.warning("No comparison articles for %s" % news_org)
    return []
  comparisons = []
  for comparison_article in comparison_articles:
    if (news_org == article_news_org and
        comparison_article.headline == article_headline):
      # comparison_article is likely the same as the original article,
      # so skip it instead of comparing it against itself
      continue
    try:
      comparison = compare_articles.compare_articles(NP_to_sentence,
                                                     VP_to_sentence, NPs, VPs,
                                                     NP_synsets, VP_synsets,
                                                     comparison_article)
      if comparison:
        comparisons.append(comparison)
    except:
      continue
  return comparisons
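The comment at the top of this example explains the pattern: Synset objects don't pickle, so they are shipped between processes as (pos, offset) pairs and rebuilt on the receiving side. Below is a minimal, self-contained sketch of that round trip; pack_synsets and unpack_synsets are hypothetical helper names, not functions from the project above.

from nltk.corpus import wordnet as wn

def pack_synsets(synsets):
    # reduce each Synset to a (pos, offset) pair, which pickles cleanly
    return [(s.pos(), s.offset()) for s in synsets]

def unpack_synsets(pairs):
    # rebuild the Synset objects on the other side of the pickle boundary
    return [wn._synset_from_pos_and_offset(pos, offset) for pos, offset in pairs]

originals = wn.synsets('dog')
restored = unpack_synsets(pack_synsets(originals))
assert [s.name() for s in restored] == [s.name() for s in originals]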
Example #2
def get_comparison(news_org, article_topic, NP_to_sentence, VP_to_sentence,
                   NPs, VPs, NP_synsets, VP_synsets, article):
  '''Compares the articles from a single NewsOrg to an article that is
  represented by its NPs and VPs.'''
  # synsets aren't picklable so they're stored as (pos, offset) and unpacked
  NP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                for (pos, offset) in NP_synsets]
  VP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                for (pos, offset) in VP_synsets]

  comparison_articles = news_org.get_query_results(article_topic)
  if not comparison_articles:
    return []
  comparisons = []
  for comparison_article in comparison_articles:
    try:
      comparison = compare_articles.compare_articles(NP_to_sentence,
                                                     VP_to_sentence, NPs, VPs,
                                                     NP_synsets, VP_synsets,
                                                     comparison_article)
      if comparison:
        comparisons.append(comparison)
    except:
      continue
  return comparisons
Example #3
def loadAll():
    global loaded  # 'loaded' is a module-level flag; without this, the assignment below would shadow it
    if loaded:
        return
    index = wn._lemma_pos_offset_map
    print 'loading wordnet into cache... '
    cache = wn._synset_offset_cache
    f = open('pos_offset.txt', 'r')
    for line in f:
        ll = line.split()
        pos = ll[0]
        offset = int(ll[1])
        wn._synset_from_pos_and_offset(pos, offset)
    f.close()
    print 'Done: ' + str(sum([len(cache[x]) for x in cache])) + '/' + str(len(index))
    loaded = True
Example #4
 def random_le_and_sy(self):
     '''
     '''
     from nltk.corpus import wordnet as wn
     
     start_at = random.choice( range( len(self.orbn_ids)))
     
     for counter,le_obj in enumerate(self.les_get_generator()):
         
         if counter >= start_at:
             print()
             print(etree.tostring(le_obj.le_el,
                                  pretty_print=True))
             answer = input('interesting? ')
             if answer == 'y':
                 target = le_obj.get_synset_id()
                 eng,version,offset,pos = target.split('-')
                 sy_obj = self.synsets_find_synset(target)
                 print()
                 print(etree.tostring(sy_obj.synset_el,
                                      pretty_print=True))
                 synset = wn._synset_from_pos_and_offset(pos,int(offset))
                 print(synset.lemmas())
                 print(synset.definition())
                 input('continue?')
Example #5
 def all_senti_synsets(self):
     from nltk.corpus import wordnet as wn
     for key, fields in self._db.items():
         pos, offset = key
         pos_score, neg_score = fields
         synset = wn._synset_from_pos_and_offset(pos, offset)
         yield SentiSynset(pos_score, neg_score, synset)
Example #6
 def __getitem__(self, k):
     for pos in ("n", "v", "a", "r"):
         try:
             synset = wn._synset_from_pos_and_offset(pos, k)
         except:
             pass
         if synset:
             return synset
     return None
Example #7
File: __init__.py Project: clips/pattern
 def __getitem__(self, k):
     for pos in ("n", "v", "a", "r"):
         try:
             synset = wn._synset_from_pos_and_offset(pos, k)
         except:
             pass
         if synset:
             return synset
     return None
Example #8
def offset_to_synset(offset):
    """ 
    Look up a synset given offset-pos 
    >>> synset = offset_to_synset('02614387-v')
    >>> print '%08d-%s' % (synset.offset, synset.pos)
    >>> print synset, synset.definition
    02614387-v
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
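Newer NLTK releases also expose this lookup as a public method, so the underscore-prefixed call above can usually be swapped out one-for-one. A sketch under that assumption (older NLTK versions only provide the private name, so check your installed version):

from nltk.corpus import wordnet as wn

def offset_to_synset_public(offset):
    # same 'NNNNNNNN-p' convention as above, e.g. '02614387-v'
    return wn.synset_from_pos_and_offset(offset[-1], int(offset[:8]))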
Example #9
 def tag_by_synset(self, pos, offset):
     if not (pos and offset): return None
     
     s = wn._synset_from_pos_and_offset(pos, offset)
     
     # output the verb's hypernym paths
     if pos == 'v':
         print s.hypernym_paths()
     
     return self._tagIt(s);
Example #10
def get_lemma(synset_id):
    try:
        wn_id = synset_id.split(':')[1]
        offset = int(wn_id[:-1])
        pos = wn_id[-1]
        print offset
        print pos
        return wordnet._synset_from_pos_and_offset(pos, offset)
    except:
        log.error("get_lemma(): error looking up synset id {0} in NLTK WordNet".format(synset_id))
        return None
Example #11
 def synsets_from_txt(self, fname):
     with open(fname, 'rb') as f:
         lines = f.readlines()
     df = []
     for line in lines:
         w = line.split()[0]
         descr = line.strip('\r\n').replace(w + ' ', '')
         synset = wn._synset_from_pos_and_offset(w[0], int(w[1:]))
         df.append({'id': w, 'names': descr, 'synset': synset})
     # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
     return df
Example #12
def offset_to_synset(offset):
    """
    Look up a synset given offset-pos
    (Thanks for @FBond, see http://moin.delph-in.net/SemCor)
    >>> synset = offset_to_synset('02614387-v')
    >>> print '%08d-%s' % (synset.offset, synset.pos)
    >>> print synset, synset.definition
    02614387-v
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
Example #13
File: __init__.py Project: clips/pattern
    def synset(self, id, pos=ADJECTIVE):
        if pos in _pattern2wordnet:
            pos = _pattern2wordnet[pos]
        try:
            s = wn._synset_from_pos_and_offset(pos, id)
            lemma = s.lemma_names()[0]
            return self[lemma]
        except:
            pass

        return None
Example #14
def offset_to_synset(offset):
    """ 
    Look up a synset given offset-pos 
    (Thanks for @FBond, see http://moin.delph-in.net/SemCor)
    >>> synset = offset_to_synset('02614387-v')
    >>> print '%08d-%s' % (synset.offset, synset.pos)
    >>> print synset, synset.definition
    02614387-v
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
Example #15
File: __init__.py Project: jhpyle/pattern
    def synset(self, id, pos=ADJECTIVE):
        if pos in _pattern2wordnet:
            pos = _pattern2wordnet[pos]
        try:
            s = wn._synset_from_pos_and_offset(pos, id)
            lemma = s.lemma_names()[0]
            return self[lemma]
        except:
            pass

        return None
Example #16
File: base.py Project: qbilius/conv-exp
 def synsets_from_txt(self, fname):
     with open(fname, "rb") as f:
         lines = f.readlines()
     df = []
     for line in lines:
         w = line.split()[0]
         descr = line.strip("\r\n").replace(w + " ", "")
         synset = wn._synset_from_pos_and_offset(w[0], int(w[1:]))
         df.append({"id": w, "names": descr, "synset": synset})
     # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
     return df
Example #17
File: run.py Project: mageed/conv-exp
    def synsets_from_csv(self, fname):
        sf = pandas.read_csv(fname, sep='\t')
        df = []
        for idx, row in sf.iterrows():
            idd = row['synset_id']
            try:
                synset = wn._synset_from_pos_and_offset(idd[0], int(idd[1:]))
            except:
                import pdb; pdb.set_trace()

            df.append({'id':idd, 'names':row['name'], 'synset':synset})
        return df
Example #18
    def pred_acc(self, compute_acc=True):
        if compute_acc:
            preds = self.predict()
        imagenet_labels = self.synsets_from_txt('synset_words.txt')
        dataset_labels = self.synsets_from_csv(
            os.path.join(self.exp, 'data', self.exp + '.csv'))
        all_hyps = lambda s: s.hyponyms()

        df = pandas.DataFrame.from_dict(dataset_labels)
        df['imgid'] = ''
        df['imgnames'] = ''
        df['kind'] = 'unknown'
        df['accuracy'] = np.nan
        df['accuracy0'] = np.nan
        df['confidence0'] = np.nan
        for no, dtlab in enumerate(dataset_labels):
            hypos = set([i for i in dtlab['synset'].closure(all_hyps)])
            hypos = hypos.union([dtlab['synset']])
            for imglab in imagenet_labels:
                if imglab['synset'] in hypos:
                    df.loc[no, 'imgid'] = imglab['id']
                    df.loc[no, 'imgnames'] = imglab['names']
                    if imglab['id'] == df.loc[no, 'id']:
                        df.loc[no, 'kind'] = 'exact'
                    else:
                        df.loc[no, 'kind'] = 'superordinate'
                    break
            if compute_acc:
                acc = False
                acc0 = False
                for i, p in enumerate(preds[no]):
                    psyn = wn._synset_from_pos_and_offset(
                        p['synset'][0], int(p['synset'][1:]))
                    df.loc[no, 'pred%d' % i] = ', '.join(psyn.lemma_names())
                    # check if the prediction is exact
                    # or at least more specific than the correct resp
                    if psyn in hypos:
                        acc = True
                    if i == 0:
                        if psyn in hypos:
                            acc0 = True
                if acc == False:
                    if df.loc[no, 'kind'] != 'unknown':
                        df.loc[no, 'accuracy'] = False
                else:
                    df.loc[no, 'accuracy'] = True
                if acc0 == False:
                    if df.loc[no, 'kind'] != 'unknown':
                        df.loc[no, 'accuracy0'] = False
                else:
                    df.loc[no, 'accuracy0'] = True
                df.loc[no, 'confidence0'] = preds[no][0]['confidence']
        return df
Example #19
    def synsets_from_csv(self, fname):
        sf = pandas.read_csv(fname, sep='\t')
        df = []
        for idx, row in sf.iterrows():
            idd = row['synset_id']
            try:
                synset = wn._synset_from_pos_and_offset(idd[0], int(idd[1:]))
            except:
                import pdb
                pdb.set_trace()

            df.append({'id': idd, 'names': row['name'], 'synset': synset})
        return df
Example #20
def get_lemma(synset_id):
    try:
        wn_id = synset_id.split(':')[1]
        offset = int(wn_id[:-1])
        pos = wn_id[-1]
        print offset
        print pos
        return wordnet._synset_from_pos_and_offset(pos, offset)
    except:
        log.error(
            "get_lemma(): error looking up synset id {0} in NLTK WordNet".
            format(synset_id))
        return None
Example #21
File: base.py Project: qbilius/conv-exp
    def pred_acc(self, compute_acc=True):
        if compute_acc:
            preds = self.predict()
        imagenet_labels = self.synsets_from_txt("synset_words.txt")
        dataset_labels = self.synsets_from_csv(os.path.join(self.exp, "data", self.exp + ".csv"))
        all_hyps = lambda s: s.hyponyms()

        df = pandas.DataFrame.from_dict(dataset_labels)
        df["imgid"] = ""
        df["imdnames"] = ""
        df["kind"] = "unknown"
        df["accuracy"] = np.nan
        df["accuracy0"] = np.nan
        df["confidence0"] = np.nan
        for no, dtlab in enumerate(dataset_labels):
            hypos = set([i for i in dtlab["synset"].closure(all_hyps)])
            hypos = hypos.union([dtlab["synset"]])
            for imglab in imagenet_labels:
                if imglab["synset"] in hypos:
                    df.loc[no, "imgid"] = imglab["id"]
                    df.loc[no, "imgnames"] = imglab["names"]
                    if imglab["id"] == df.loc[no, "id"]:
                        df.loc[no, "kind"] = "exact"
                    else:
                        df.loc[no, "kind"] = "superordinate"
                    break
            if compute_acc:
                acc = False
                acc0 = False
                for i, p in enumerate(preds[no]):
                    psyn = wn._synset_from_pos_and_offset(p["synset"][0], int(p["synset"][1:]))
                    df.loc[no, "pred%d" % i] = ", ".join(psyn.lemma_names())
                    # check if the prediction is exact
                    # or at least more specific than the correct resp
                    if psyn in hypos:
                        acc = True
                    if i == 0:
                        if psyn in hypos:
                            acc0 = True
                if acc == False:
                    if df.loc[no, "kind"] != "unknown":
                        df.loc[no, "accuracy"] = False
                else:
                    df.loc[no, "accuracy"] = True
                if acc0 == False:
                    if df.loc[no, "kind"] != "unknown":
                        df.loc[no, "accuracy0"] = False
                else:
                    df.loc[no, "accuracy0"] = True
                df.loc[no, "confidence0"] = preds[no][0]["confidence"]
        return df
Example #22
    def create_obj_scale_dictionary(obj_names, scales_dic, key_type='synset'):
        """
        Creates a dictionary associating objects in the list_aws file with their synset names, given
        a dictionary of desired category<>scale or synset<>scale associations.
        Category names can be retrieved from the 'shapenet_synset' field on the shapenet meta db.
        Example of such categories are: 'airplane', 'bag', 'bathtub', 'train'

        :param obj_names: list containing the object names
        :param scales_dic: A dictionary containing scale values for particular classes of objects
        :param key_type: Type of keys in scales_dic. Could be 'synset' (Default) or 'category'.
        :return: association dictionary with object names as keys
        """
        try:
            print('Getting synset offsets from MongoDB...')
            db_client = pm.MongoClient(port=22334)
            table = db_client['synthetic_generative']['3d_models']
            cursor = table.find({'type': 'shapenet',
                                 'version': 2,
                                 'id': {'$in': obj_names}})

            if key_type == 'category':
                obj_scale_dic = dict()  # Stores the table for id-scale correspondence
                for doc in cursor:
                    offset = doc['shapenet_synset']
                    synset_name = wn._synset_from_pos_and_offset(offset[0], int(offset[1:])).name()
                    basic_name = synset_name.split('.')[0]  # Take the first part of the synset name
                    if basic_name in scales_dic:
                        obj_scale_dic["http://threedworld.s3.amazonaws.com/"+doc['id']+'.bundle'] = \
                            {'option': 'Multi_size', 'scale': scales_dic[basic_name]}
                    else:
                        obj_scale_dic["http://threedworld.s3.amazonaws.com/"+doc['id']+'.bundle'] = \
                            {'option': 'Multi_size', 'scale': 1}
            elif key_type == 'synset':
                obj_scale_dic = dict()  # Stores the table for id-scale correspondence
                for doc in cursor:
                    offset = doc['shapenet_synset']
                    if offset in scales_dic:
                        obj_scale_dic["http://threedworld.s3.amazonaws.com/" + doc['id'] + '.bundle'] = \
                            {'option': 'Multi_size', 'scale': scales_dic[offset]}
                    else:
                        obj_scale_dic["http://threedworld.s3.amazonaws.com/" + doc['id'] + '.bundle'] = \
                            {'option': 'Multi_size', 'scale': 1}
            else:
                raise ValueError("Key type should be either: 'synset' or 'category'.")
            print('Table created!')
            db_client.close()
        except:
            print('Could not connect to DB. Create a SSH tunnel.')
            raise

        return obj_scale_dic
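A hedged usage sketch for the function above, with placeholder ShapeNet ids and a category-keyed scale table (it assumes the SSH tunnel to MongoDB on port 22334 described in the code is already open, and that the function is callable as shown):

obj_names = ['<shapenet_id_1>', '<shapenet_id_2>']   # placeholder model ids
scales = {'airplane': 0.5, 'bag': 1.2}               # category -> scale factor
obj_scale_dic = create_obj_scale_dictionary(obj_names, scales, key_type='category')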
Example #23
def offset2ss(offset_31, wn_31_30):
    try:
        # convert wordnet 3.1 id to 3.0 because NLTK limited to 3.0 for now
        offset_30 = wn_31_30[offset_31]
        synset = wn._synset_from_pos_and_offset(str(offset_30[-1:]),
                                                int(offset_30[:8]))
    # print "offset_31:", offset_31, "\toffset_30:", offset_30, "\tsynset:", synset
    # print type(wn._synset_from_pos_and_offset(str(offset_30[-1:]), int(offset_30[:8])))
    except:
        # print "key", offset_31, "is not mapped.."           #handling the mapping issue between wn3.1 and wn3.0
        # print "score given is 0"
        unmapped_keys.add(offset_31)
        synset = None
    return synset
Example #24
def examine_synset(s):
    m = re.search(r'^(?:eng-30-)?(\d+)-(\w)', s)
    if m:
        ss = wn._synset_from_pos_and_offset(m.group(2), int(m.group(1)))
        print(ss)
        if not re.search(r'^eng-30-', s):
            s = 'eng-30-' + s
    if s in model.wv.vocab:
        print('Nearest neighbors:')
        for word, score in model.wv.most_similar(s):
            print('\t%s\t%.3f' % (word, score))
    else:
        print('Not found in the vocabulary.')
    print()
Example #25
 def senti_synset(self, *vals):        
     if tuple(vals) in self.db:
         pos_score, neg_score = self.db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)  # @UndefinedVariable
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wn.synset(vals[0])  # @UndefinedVariable
         pos = synset.pos
         offset = synset.offset
         if (pos, offset) in self.db:
             pos_score, neg_score = self.db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #26
 def senti_synset(self, *vals):        
     if tuple(vals) in self.db:
         pos_score, neg_score = self.db[tuple(vals)]
         pos, offset = vals
         synset = wordnet._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wordnet.synset(vals[0])
         pos = synset.pos
         offset = synset.offset
         if (pos, offset) in self.db:
             pos_score, neg_score = self.db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #27
 def senti_synset(self, *vals):        
     if tuple(vals) in self.db:
         pos_score, neg_score = self.db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wn.synset(vals[0])
         pos = synset.pos
         offset = synset.offset
         if (pos, offset) in self.db:
             pos_score, neg_score = self.db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #28
File: __init__.py Project: clips/pattern
    def __init__(self, synset):
        """ A set of synonyms that share a common meaning.
        """
        if isinstance(synset, WordNetSynset):
            self._wnsynset = synset
        elif isinstance(synset, Synset):
            self = self
        elif isinstance(synset, (tuple, int)):
            if isinstance(synset, int):
                synset = (synset, "NN")
            offset, pos = synset
            self._wnsynset = wn._synset_from_pos_and_offset(_pattern2wordnet[pos] if pos in _pattern2wordnet else pos, offset)
        else:
            raise NotImplementedError

        self._synset = _synset
Example #29
File: base.py Project: qbilius/conv-exp
    def synsets_from_csv(self, fname, sep=","):
        with open(fname, "rb") as f:
            lines = f.readlines()
        df = []
        for line in lines:
            spl = line.strip("\n").split(sep)
            try:
                synset = wn._synset_from_pos_and_offset(spl[0][0], int(spl[0][1:]))
            except:
                import pdb

                pdb.set_trace()

            df.append({"id": spl[0], "names": spl[1], "synset": synset})
        # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
        return df
Example #30
    def synsets_from_csv(self, fname, sep=','):
        with open(fname, 'rb') as f:
            lines = f.readlines()
        df = []
        for line in lines:
            spl = line.strip('\n').split(sep)
            try:
                synset = wn._synset_from_pos_and_offset(
                    spl[0][0], int(spl[0][1:]))
            except:
                import pdb
                pdb.set_trace()

            df.append({'id': spl[0], 'names': spl[1], 'synset': synset})
        # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
        return df
Example #31
File: __init__.py Project: jhpyle/pattern
    def __init__(self, synset):
        """ A set of synonyms that share a common meaning.
        """
        if isinstance(synset, WordNetSynset):
            self._wnsynset = synset
        elif isinstance(synset, Synset):
            self = self
        elif isinstance(synset, (tuple, int)):
            if isinstance(synset, int):
                synset = (synset, "NN")
            offset, pos = synset
            self._wnsynset = wn._synset_from_pos_and_offset(_pattern2wordnet[pos] if pos in _pattern2wordnet else pos, offset)
        else:
            raise NotImplementedError

        self._synset = _synset
Example #32
 def senti_synset(self, *vals):        
     from nltk.corpus import wordnet as wn
     if tuple(vals) in self._db:
         pos_score, neg_score = self._db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wn.synset(vals[0])
         pos = synset.pos()
         offset = synset.offset()
         if (pos, offset) in self._db:
             pos_score, neg_score = self._db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #33
 def senti_synset(self, *vals):        
     from nltk.corpus import wordnet as wn
     if tuple(vals) in self._db:
         pos_score, neg_score = self._db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wn.synset(vals[0])
         pos = synset.pos()
         offset = synset.offset()
         if (pos, offset) in self._db:
             pos_score, neg_score = self._db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #34
 def senti_synset(self, *vals):
     if tuple(vals) in self.db:
         print "It is here"
         pos_score, neg_score = self.db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         print "No it is here."
         synset = wn.synset(vals[0])
         pos = synset.pos
         offset = synset.offset
         if (pos, offset) in self.db:
             pos_score, neg_score = self.db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
Example #35
File: demo.py Project: dheera/deeptrash
def predict(img):
    img = np.swapaxes(img, 0, 2)
    img = np.swapaxes(img, 1, 2)
    img = img[np.newaxis, :]
    # compute the predict probabilities
    mod.forward(Batch([mx.nd.array(img)]))
    prob = mod.get_outputs()[0].asnumpy()
    # print the top-5
    prob = np.squeeze(prob)
    a = np.argsort(prob)[::-1]
    for i in a[0:5]:
        print('index=%d, probability=%f, class=%s' % (i, prob[i], labels[i]))
        offset = int(labels[i].split(' ')[0].strip('n'))
        synset = wordnet._synset_from_pos_and_offset('n', offset)
        paths = synset.hypernym_paths()
        paths = list(map(lambda x: '/'.join(map(lambda y: y.name(), x)),
                         paths))
Example #36
    def senti_synset(self, *vals):
        print ">>> @senti_synset"
        print ">>> vals: " + str(vals)

        if tuple(vals) in self.db:
            pos_score, neg_score = self.db[tuple(vals)]
            pos, offset = vals
            synset = wn._synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos
            offset = synset.offset
            if (pos, offset) in self.db:
                pos_score, neg_score = self.db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None
Example #37
 def senti_synset(self, *vals):   
     '''
     get pos and neg scores for word
     input: vals is a word or word, pos tag
     output: return an object SentiSynset with word, pos score and neg score set in the object     
     '''
     if tuple(vals) in self.db:
         pos_score, neg_score = self.db[tuple(vals)]
         pos, offset = vals
         synset = wn._synset_from_pos_and_offset(pos, offset)
         return SentiSynset(pos_score, neg_score, synset)
     else:
         synset = wn.synset(vals[0])
         pos = synset.pos
         offset = synset.offset
         if (pos, offset) in self.db:
             pos_score, neg_score = self.db[(pos, offset)]
             return SentiSynset(pos_score, neg_score, synset)
         else:
             return None
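The docstring above describes the two calling conventions: a (pos, offset) pair that is looked up directly in the score table, or a synset name that is first resolved through wn.synset(). NLTK's bundled SentiWordNet reader follows the same idea, so it can serve as a quick cross-check (a sketch, assuming the 'sentiwordnet' and 'wordnet' corpora have been downloaded with nltk.download):

from nltk.corpus import sentiwordnet as swn

breakdown = swn.senti_synset('breakdown.n.03')  # lookup by synset name
print(breakdown.pos_score(), breakdown.neg_score(), breakdown.obj_score())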
Example #38
    def __init__(self, steps=2):
        with open("data/9k.labels", "r") as _labels:
            for l in _labels:
                synset = wn._synset_from_pos_and_offset('n', int(l[1:]))
                self.original_synsets.append(synset)

        #print("all size: ", len(synsets))
        self.all_hypernyms = self.reduce(self.original_synsets, steps)
        #print("hypernyms size:", len(self.all_hypernyms))
        print("reduced", len(self.original_synsets), "to", len(self.all_hypernyms))

        for i in range(len(self.original_synsets)):
            ss_name = self.original_synsets[i]

            ss_hypernyms = self.get_hypernyms(ss_name)
            ss_hypernyms_indices = []
            for h in ss_hypernyms:
                ss_hypernyms_indices.append(self.get_new_index(h))

            #print(ss_hypernyms, "->", ss_hypernyms_indices)

            self.lookupTable2.append(ss_hypernyms_indices)
Example #39
    def parseXML(self, file):
        """ 
		form: 
		(file_name) --> print

		description: 
		function that fills the synset2synonym & synset2word dict

		example:
		>>> graphWN.parseXML("wolf-1.0b4.xml")
		XML parsed

		"""
        tree = etree.parse(file)
        synsets = tree.xpath("SYNSET")
        synonyms = []
        for synset in synsets:
            synset_id_orig = synset.findtext("ID")
            synonymsEl = synset.findall("SYNONYM/LITERAL")
            syn = []
            for synonym in synonymsEl:  # build a list of synonyms from the list of SYNONYM elements
                syn.append(synonym.text)
                #if synonym.text in synonyms:
                #print "IN"
                #else:
                #synonyms.append(synonym.text)
            self.synset2synonym[synset_id_orig] = syn
            #print self.synset2synonym
            synset_id, pos = synset_id_orig.split(
                "-")[2], synset_id_orig.split("-")[3]
            # mapping fr-en b --> r
            if pos == "b":
                pos = "r"
            synset = wn._synset_from_pos_and_offset(pos, int(synset_id))
            synset_name = synset.name.split(".")[0]
            if synset_id_orig not in self.synset2word:
                self.synset2word[synset_id_orig] = synset_name
        print "XML parsed"
Example #40
    def random_le_and_sy(self):
        '''
        '''
        from nltk.corpus import wordnet as wn

        start_at = random.choice(range(len(self.orbn_ids)))

        for counter, le_obj in enumerate(self.les_get_generator()):

            if counter >= start_at:
                print()
                print(etree.tostring(le_obj.le_el, pretty_print=True))
                answer = input('interesting? ')
                if answer == 'y':
                    target = le_obj.get_synset_id()
                    eng, version, offset, pos = target.split('-')
                    sy_obj = self.synsets_find_synset(target)
                    print()
                    print(etree.tostring(sy_obj.synset_el, pretty_print=True))
                    synset = wn._synset_from_pos_and_offset(pos, int(offset))
                    print(synset.lemmas())
                    print(synset.definition())
                    input('continue?')
Example #41
 def loadWNDFile(self, wnd_file):
     print("loading WNDomains file...")
     f = codecs.open(wnd_file, "r", "utf-8-sig")
     lines = f.readlines()
     for line in lines:
         temp = line.strip("\n").split(" ")
         # print(temp)
         pos = temp[0].split("-")[1]
         offset = temp[0].split("-")[0]
         categories = temp[2:]
         try:
             synset = wn._synset_from_pos_and_offset(pos,
                                                     int(offset)).name()
             self.dictS[synset] = categories
             for category in categories:
                 if category not in self.dictC:
                     self.dictC[category] = []
                 if synset not in self.dictC[category]:
                     self.dictC[category].append(synset)
         except:
             continue
     f.close()
     print("    DONE!!")
Example #42
 def ilidef_to_sensekey(self, ilidef, lemma):
     '''
     given an ilidef and a lemma, this method
     returns the wn30 sense key
     
     @requires: nltk (3.0 was used)
     
     @type  ilidef: str
     @param ilidef: wn30 ilidef (for example "ili-30-05768553-n")
     
     @type  lemma: str
     @param lemma: lemma (for example "dream")
     '''
     ili, version, offset, pos = ilidef.split('-')
     synset = wn._synset_from_pos_and_offset(pos, int(offset))
     sense_keys = [
         synset_lemma.key for synset_lemma in synset.lemmas
         if synset_lemma.key.startswith(lemma + "%")
     ]
     if sense_keys:
         return sense_keys[0]
     else:
         return ""
Example #43
	def parseXML(self, file):
		""" 
		form: 
		(file_name) --> print

		description: 
		function that fills the synset2synonym & synset2word dict

		example:
		>>> graphWN.parseXML("wolf-1.0b4.xml")
		XML parsed

		"""
		tree = etree.parse(file)
		synsets=tree.xpath("SYNSET")
		synonyms=[]
		for synset in synsets:
			synset_id_orig=synset.findtext("ID")
			synonymsEl=synset.findall("SYNONYM/LITERAL")
			syn=[]
			for synonym in synonymsEl: # build a list of synonyms from the list of SYNONYM elements
			  syn.append(synonym.text)
			  #if synonym.text in synonyms:
			    #print "IN"
			  #else:
			    #synonyms.append(synonym.text)
			self.synset2synonym[synset_id_orig]=syn
			#print self.synset2synonym
			synset_id, pos = synset_id_orig.split("-")[2], synset_id_orig.split("-")[3]
			# mapping fr-en: b --> r
			if pos == "b":
				pos = "r"
			synset = wn._synset_from_pos_and_offset(pos, int(synset_id))
			synset_name = synset.name.split(".")[0]
			if synset_id_orig not in self.synset2word:
				self.synset2word[synset_id_orig] = synset_name
		print "XML parsed"
Example #44
    def parse_src_file(self):
        lines = codecs.open(self.filename, "r", "utf8").read().splitlines()
        lines = filter((lambda x : not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = re.split(r"\t+", line)
            fields = map(str.strip, fields)
            try:            
                # fields: POS, ID, positive score, negative score, synset terms, gloss
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except:
                # the format of each sentiment-lexicon line is validated here
                sys.stderr.write("Line %s formatted incorrectly: %s\n" % (i, line))

            synset = wn._synset_from_pos_and_offset(pos, int(offset))
            # the meaning of pos and offset here is unclear; probably just checking they exist? so far nothing seems to fail this check
            if pos and offset:
                offset = int(offset)
                self.db[(pos, offset)] = (float(pos_score), float(neg_score))
            if pos_score and neg_score:
                if (pos_score, neg_score) not in self.score_to_senti_synset:
                    self.score_to_senti_synset[(pos_score, neg_score)] = []
                self.score_to_senti_synset[(pos_score, neg_score)].append(SentiSynset(pos_score,neg_score,synset))
            if synset:
                self.synset_to_score[synset] = (pos_score, neg_score)
Example #45
 def all_senti_synsets(self):
     for key, fields in self.db.iteritems():
         pos, offset = key
         pos_score, neg_score = fields
         synset = wordnet._synset_from_pos_and_offset(pos, offset)
         yield SentiSynset(pos_score, neg_score, synset)
Example #46
def offset_to_synset(offset):
 
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
Example #47
 def id2synset(self, offset):
     x = offset[1:]
     return wn._synset_from_pos_and_offset('n', int(x))
Example #48
def use_wordnet(FreelingFolder, WordnetFolder):
    """
    Call Wordnet using NLTK to get the lexnames.
    Authors: #cf, #uh
    """
    print("use_wordnet...")
    
    if not os.path.exists(WordnetFolder):
        os.makedirs(WordnetFolder)


    InPath = FreelingFolder+"*.xml"
    for File in glob.glob(InPath): 
		
        LexErrCounter = collections.Counter()
		
        with open(File, "r") as InFile: 
            Filename = os.path.basename(File)
            Text = InFile.read()
            Text = re.split(r"\n\s*?</token>", Text)
            NewText = ["<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<wrapper>"]
            for Line in Text[0:-1]:
                Line = re.sub("token", "w", Line)
                Line = re.sub("sentence", "s", Line)
                # delete the ids
                Line = re.sub(r'\sid=".*?"', "", Line)
                Line = Line + "</w>"
                #print(Line)
                Word = re.findall("form=\"(.*?)\" ", Line)[0]
                #print(Word)
                Line = re.sub("</w>", Word+"</w>", Line)
                #print(Line) 
                if "wn=" in Line: 
                    #print(Line)
                    SynsetID = re.findall("wn=.*\"", Line)[0]
                    SynsetNumber = int(SynsetID[4:-3])
                    SynsetPOS = SynsetID[-2:-1]
                    #print(SynsetID, SynsetPOS, SynsetNumber)
                    SynsetAbbID = ""
                    try:
                        SynsetAbbID = wn._synset_from_pos_and_offset(SynsetPOS, SynsetNumber)
                    except:
                        ""
                        #print("Error when trying to get synset name.")
                    SynsetAbbID = str(SynsetAbbID)
                    SynsetAbbID = SynsetAbbID[8:-2]
                    #print(SynsetAbbID)
                    Lexname = ""
                    try:
                        Lexname = wn.synset(SynsetAbbID).lexname()
                    except:
                        #print("Error when trying to get lexname.")
                        LexErrCounter.update({"LexNameError":1})
                        Lexname = "xxx"
                    #print(Lexname)
                    Line = re.sub("wn=(.*) >", "wnsyn=\\1 wnlex=\""+Lexname+"\">", Line)
                    #print(Line)
                    NewText.append(Line)
                elif "wn=" not in Line and "<s" not in Line:
                    #print(Line)
                    Line = re.sub(" >", " wnsyn=\"xxx\" wnlex=\"xxx\">", Line)
                    #print(Line)
                    NewText.append(Line)
                elif "<s" in Line:
                    #print(Line)
                    Line = re.sub(" >", " wnsyn=\"xxx\" wnlex=\"xxx\" >", Line)
                    #print(Line)
                    NewText.append(Line)
                
            
            if LexErrCounter["LexNameError"] > 0:
                print(str(LexErrCounter["LexNameError"]) + " lexname(s) could not be found in " + str(Filename))
            NewText.append("</s>\n</wrapper>")                
            NewText = ''.join(NewText)
            with open(WordnetFolder+Filename[:-4]+".xml", "w") as OutFile: 
                OutFile.write(NewText)
                
    print("Done.")
Example #49
def id2ss(id):
    """Given a WordNet Affect id (e.g. n#05588321) return a synset"""
    return wordnet._synset_from_pos_and_offset(str(id[:1]), int(id[2:]))
Example #50
def get_synset_from_pos_offset(pos,offset):
    word_set = wn._synset_from_pos_and_offset(pos,offset)
    return jsonify({'extended_information' : build_json_for_portion_node_1(word_set)})
Example #51
def readWSD(wsdFile, sentence1, sentence2):

    textFile=open(wsdFile,"r")
    lines = []
    for line in textFile.readlines():
        lines.append(line)
        
    del lines[0]
    #lines look like this: ['ctx_01 w2  02684924-v !! continue\n',.... 'ctx_02 w1  01056411-n !! stop\n', ]
    for l in lines:
        r = l.split()
        if(r[0]=='ctx_01'):
            # will work on sentence1
            senseVal  = []
            root = r[len(r)-1]
            senseVal = r[len(r)-3]
            # this loop strips any leading 0 from the database location / synset location
            while(senseVal[0]=="0"):
                senseVal = senseVal[1:]

            for w in sentence1:
                #print w.getValue()
                if(w.getRootValue()==root and isinstance("",type(senseVal))):
                    # splitting '13244109-n'  ---> ['13244109', 'n']
                    senseVal = senseVal.split('-')
                    p = senseVal[len(senseVal)-1]
                    num = int(senseVal[0])

                    if(num!=-1):
                        s = wn._synset_from_pos_and_offset(p, num)
                        
                        s = s.__str__()
                        s = s.split("'")[1] 
                        w.setSynSet(s)
        
        if(r[0]=='ctx_02'):
            root2 = r[len(r)-1]
            senseVal2 = r[len(r)-3]
            #s2SenseVal = r[len(r)-3]
            # this loop strips any leading 0 from the database location / synset location
            while(senseVal2[0]=="0"):
                senseVal2 = senseVal2[1:]
            for w in sentence2:
                if(w.getRootValue()==root2 and isinstance("",type(senseVal2))):
                    senseVal2 = senseVal2.split('-')

                    p = senseVal2[len(senseVal2)-1]
                    num = int(senseVal2[0])
                    if(num!=-1):
                        s = wn._synset_from_pos_and_offset(p, num)
                        s = s.__str__()
                        s = s.split("'")[1]
                        w.setSynSet(s)
    SENT1NOUNS = []
    SENT1VERBS = []
    
    SENT2NOUNS = []
    SENT2VERBS = []

    for w1 in sentence1:
        if(w1.getPos()=='NOUN'):
            SENT1NOUNS.append(w1)
        if(w1.getPos()=='VERB'):
            SENT1VERBS.append(w1)
    
    for w2 in sentence2:
        if(w2.getPos()=='NOUN'):
            SENT2NOUNS.append(w2)
        if(w2.getPos()=='VERB'):
            SENT2VERBS.append(w2)

    # Finding the weights for nouns
    # record the max similarity values, later just add them
    maxNounSimilarityValues = []
    NOUNLIST = []
    VERBLIST = []
    # this is for me to see which word from sentence 1 maps to the corresponding words in sentence 2
    nounMapping = {}
    
    for n1 in SENT1NOUNS:
        if(n1.getSynSet()!="null"):
            exp1 = n1.getSynSet() # this is something like this: believe.v.01
            noun1 = wn.synset(exp1)

            maxValue = 0
            # just for printing
            ntemp = ""
            for n2 in SENT2NOUNS:
                if(n1.getSynSet()!="null"):
                    exp2 = n2.getSynSet() # this is something like this: believe.v.01
                    if(exp2!='null'):
                        noun2 = wn.synset(exp2)
                        value = noun1.path_similarity(noun2)
                        if(value > maxValue):
                            maxValue = value

            n1.setWeight(maxValue)
            
        else: # if the synset is null
            n1.setWeight(1)    
    # record the max similarity values, later just add them
    
    maxVerbSimilarityValues = []
    # this is for me to see which word from sentence 1 maps to the corresponding words in sentence 2
    verbMapping = {}

    for v1 in SENT1VERBS:
        if(v1.getSynSet()!="null"):
            #v1.printWord()
            exp1 = v1.getSynSet()
            verb1 = wn.synset(exp1)

            maxValue = 0 
            # just for printing
            vtemp = ""
            for v2 in SENT2VERBS:
                if(v2.getSynSet()!="null"):
                    exp2 = v2.getSynSet()
                    verb2 = wn.synset(exp2)
                    value = verb1.path_similarity(verb2)
                    #print value
                    if(value > maxValue):
                        maxValue = value
                        #maxVerbSimilarityValues.append(maxValue)
                        #verbMapping[v1.getValue()] = v2.getValue()
                        #vtemp = v2.getValue()
                        #matchedVerb2 = v2.copy()
            v1.setWeight(maxValue)
            #matchedVerb2.setWeight(maxValue)
            #wordsFromSentence1.append(v1)
            #VERBLIST.append(v1)
            #VERBLIST.append(matchedVerb2)
        else:
            v1.setWeight(1)
    allWordsfromSentence1 = SENT1NOUNS+SENT1VERBS
    return allWordsfromSentence1
Example #52
 def offset2synset(self, offset):
     '''
     offset2synset('02614387-v')
     Synset('live.v.02')
     '''
     return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
Example #53
 def offset2synset(self, offset):
     '''
     offset2synset('02614387-v')
     Synset('live.v.02')
     '''
     return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
Example #54
 def id2synset(self, offset):
     x = offset[1:]
     return wn._synset_from_pos_and_offset('n', int(x))
Example #55
def get_synset_from_ssid(ssid):
    try:
        return wn._synset_from_pos_and_offset(ssid[-1:], int(ssid[:8]))
    except WordNetError as e:
        print "SSID %s not found" % ssid
        raise e
Example #56
def get_synset_from_POS_offset(pos, offset):
    synset = wn._synset_from_pos_and_offset(pos,offset)
    return synset
Example #57
			print url
			urllib.urlretrieve(url,filename=folder_path +"/url_list.txt");
			counter = 1
			with open(folder_path + "/url_list.txt") as url_list:
					for line in url_list:
								print line
								counter = counter + 1
								urllib.urlretrieve(line,filename=folder_path + "/" + str(counter)+".jpg");


seedfile = open(sys.argv[1] + "/text.seed","a");

counter = 1;
for key,elem in imageNet_Syn_names.items():
						offset = int(key[1:])
						image_syn =  wn._synset_from_pos_and_offset('n',offset)
						folder_name = str(counter) + "_" + key
						desc = image_syn.definition().replace(" ","_")
						seedfile.write(desc)
						folder_name = folder_name + desc
						folder_path = sys.argv[1] + "/" + folder_name
						os.makedirs(folder_path)
						counter = counter + 1
						getImages(key,folder_path)

#print result;
#get the data from ImageNet API
#generate the Mapping with Hyponym Set as well


urllib.urlretrieve("http://image-net.org/archive/words.txt", filename="test.txt")
Example #58
 def all_senti_synsets(self):
     for key, fields in self.db.iteritems():
         pos, offset = key
         pos_score, neg_score = fields
         synset = wn._synset_from_pos_and_offset(pos, offset)
         yield SentiSynset(pos_score, neg_score, synset)
Example #59
File: imagenet.py Project: jethrotan/bobo
def category(wnid):
    pos = wnid[0]
    synset = wordnet._synset_from_pos_and_offset(pos, int(str(wnid[1:]).lstrip('0')))  # assume noun
    return str(synset.lemmas[0].name).replace(" ","_")
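ImageNet WNIDs are just the WordNet POS letter followed by the zero-padded offset, so the same lookup also works with the modern, method-based NLTK API. A sketch under that assumption (lemmas() and name() are methods in current NLTK, replacing the attribute access used in the snippet above; 'nNNNNNNNN' is the usual ImageNet noun id format):

from nltk.corpus import wordnet

def category_v3(wnid):
    # e.g. wnid = 'n02084071' -> pos 'n', offset 2084071
    synset = wordnet.synset_from_pos_and_offset(wnid[0], int(wnid[1:]))
    return synset.lemmas()[0].name().replace(' ', '_')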