def get_comparison(news_org, NP_to_sentence, VP_to_sentence, NPs, VPs,
                   NP_synsets, VP_synsets, article_topic, article_headline,
                   article_news_org):
    '''Compares the articles from a single NewsOrg to an article that is
    represented by its NPs and VPs.'''
    # synsets aren't picklable, so they're stored as (pos, offset) and unpacked
    NP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                  for (pos, offset) in NP_synsets]
    VP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                  for (pos, offset) in VP_synsets]
    comparison_articles = news_org.get_query_results(article_topic)
    if not comparison_articles:
        logger.log.warning("No comparison articles for %s" % news_org)
        return []
    comparisons = []
    for comparison_article in comparison_articles:
        if (news_org == article_news_org
                and comparison_article.headline == article_headline):
            # comparison_article is likely the same as the original article;
            # skip it rather than compare the article against itself
            continue
        try:
            comparison = compare_articles.compare_articles(
                NP_to_sentence, VP_to_sentence, NPs, VPs,
                NP_synsets, VP_synsets, comparison_article)
            if comparison:
                comparisons.append(comparison)
        except Exception:
            continue
    return comparisons
def get_comparison(news_org, article_topic, NP_to_sentence, VP_to_sentence,
                   NPs, VPs, NP_synsets, VP_synsets, article):
    '''Compares the articles from a single NewsOrg to an article that is
    represented by its NPs and VPs.'''
    # synsets aren't picklable, so they're stored as (pos, offset) and unpacked
    NP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                  for (pos, offset) in NP_synsets]
    VP_synsets = [wn._synset_from_pos_and_offset(pos, offset)
                  for (pos, offset) in VP_synsets]
    comparison_articles = news_org.get_query_results(article_topic)
    if not comparison_articles:
        return []
    comparisons = []
    for comparison_article in comparison_articles:
        try:
            comparison = compare_articles.compare_articles(
                NP_to_sentence, VP_to_sentence, NPs, VPs,
                NP_synsets, VP_synsets, comparison_article)
            if comparison:
                comparisons.append(comparison)
        except Exception:
            continue
    return comparisons
loaded = False  # module-level guard (the original references it as a global)

def loadAll():
    global loaded
    if loaded:
        return
    index = wn._lemma_pos_offset_map
    print('loading wordnet into cache... ')
    cache = wn._synset_offset_cache
    with open('pos_offset.txt', 'r') as f:
        for line in f:
            ll = line.split()
            pos = ll[0]
            offset = int(ll[1])
            wn._synset_from_pos_and_offset(pos, offset)
    print('Done: ' + str(sum(len(cache[x]) for x in cache)) + '/' + str(len(index)))
    loaded = True
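# Hedged usage sketch for loadAll(): warming NLTK's private synset cache up
# front trades startup time for faster repeated lookups. 'pos_offset.txt'
# (one "pos offset" pair per line) is an assumption carried over from the
# function above.
if __name__ == '__main__':
    from nltk.corpus import wordnet as wn
    loadAll()
    # later lookups are cache hits in wn._synset_offset_cache
    print(wn._synset_from_pos_and_offset('v', 2614387))  # Synset('live.v.02')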
def random_le_and_sy(self):
    ''' '''
    from nltk.corpus import wordnet as wn
    start_at = random.choice(range(len(self.orbn_ids)))
    for counter, le_obj in enumerate(self.les_get_generator()):
        if counter >= start_at:
            print()
            print(etree.tostring(le_obj.le_el, pretty_print=True))
            answer = input('interesting? ')
            if answer == 'y':
                target = le_obj.get_synset_id()
                eng, version, offset, pos = target.split('-')
                sy_obj = self.synsets_find_synset(target)
                print()
                print(etree.tostring(sy_obj.synset_el, pretty_print=True))
                synset = wn._synset_from_pos_and_offset(pos, int(offset))
                print(synset.lemmas())
                print(synset.definition())
                input('continue?')
def all_senti_synsets(self):
    from nltk.corpus import wordnet as wn
    for key, fields in self._db.items():
        pos, offset = key
        pos_score, neg_score = fields
        synset = wn._synset_from_pos_and_offset(pos, offset)
        yield SentiSynset(pos_score, neg_score, synset)
def __getitem__(self, k):
    # try each part of speech in turn; the lookup raises if (pos, k) is not
    # a valid synset, so skip to the next pos on failure
    for pos in ("n", "v", "a", "r"):
        try:
            synset = wn._synset_from_pos_and_offset(pos, k)
        except Exception:
            continue
        if synset:
            return synset
    return None
def offset_to_synset(offset):
    """
    Look up a synset given an offset-pos string
    (thanks to @FBond, see http://moin.delph-in.net/SemCor).

    >>> synset = offset_to_synset('02614387-v')
    >>> print('%08d-%s' % (synset.offset(), synset.pos()))
    02614387-v
    >>> print(synset, synset.definition())
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
def tag_by_synset(self, pos, offset):
    if not (pos and offset):
        return None
    s = wn._synset_from_pos_and_offset(pos, offset)
    # output the verb's hypernym paths
    if pos == 'v':
        print(s.hypernym_paths())
    return self._tagIt(s)
def get_lemma(synset_id):
    try:
        wn_id = synset_id.split(':')[1]
        offset = int(wn_id[:-1])
        pos = wn_id[-1]
        print(offset)
        print(pos)
        return wordnet._synset_from_pos_and_offset(pos, offset)
    except Exception:
        log.error("get_lemma(): error looking up synset id {0} in NLTK WordNet".format(synset_id))
        return None
def synsets_from_txt(self, fname):
    # open in text mode; the lines are processed with str methods below
    with open(fname, 'r') as f:
        lines = f.readlines()
    df = []
    for line in lines:
        w = line.split()[0]
        descr = line.strip('\r\n').replace(w + ' ', '')
        synset = wn._synset_from_pos_and_offset(w[0], int(w[1:]))
        df.append({'id': w, 'names': descr, 'synset': synset})
    # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
    return df
def synset(self, id, pos=ADJECTIVE):
    if pos in _pattern2wordnet:
        pos = _pattern2wordnet[pos]
    try:
        s = wn._synset_from_pos_and_offset(pos, id)
        lemma = s.lemma_names()[0]
        return self[lemma]
    except Exception:
        pass
    return None
def synsets_from_csv(self, fname):
    sf = pandas.read_csv(fname, sep='\t')
    df = []
    for idx, row in sf.iterrows():
        idd = row['synset_id']
        try:
            synset = wn._synset_from_pos_and_offset(idd[0], int(idd[1:]))
        except Exception:
            # drop into the debugger on malformed synset ids
            import pdb
            pdb.set_trace()
        df.append({'id': idd, 'names': row['name'], 'synset': synset})
    return df
def pred_acc(self, compute_acc=True):
    if compute_acc:
        preds = self.predict()
    imagenet_labels = self.synsets_from_txt('synset_words.txt')
    dataset_labels = self.synsets_from_csv(
        os.path.join(self.exp, 'data', self.exp + '.csv'))
    all_hyps = lambda s: s.hyponyms()

    df = pandas.DataFrame.from_dict(dataset_labels)
    df['imgid'] = ''
    df['imgnames'] = ''
    df['kind'] = 'unknown'
    df['accuracy'] = np.nan
    df['accuracy0'] = np.nan
    df['confidence0'] = np.nan

    for no, dtlab in enumerate(dataset_labels):
        hypos = set([i for i in dtlab['synset'].closure(all_hyps)])
        hypos = hypos.union([dtlab['synset']])
        for imglab in imagenet_labels:
            if imglab['synset'] in hypos:
                df.loc[no, 'imgid'] = imglab['id']
                df.loc[no, 'imgnames'] = imglab['names']
                if imglab['id'] == df.loc[no, 'id']:
                    df.loc[no, 'kind'] = 'exact'
                else:
                    df.loc[no, 'kind'] = 'superordinate'
                break

        if compute_acc:
            acc = False
            acc0 = False
            for i, p in enumerate(preds[no]):
                psyn = wn._synset_from_pos_and_offset(
                    p['synset'][0], int(p['synset'][1:]))
                df.loc[no, 'pred%d' % i] = ', '.join(psyn.lemma_names())
                # check if the prediction is exact
                # or at least more specific than the correct resp
                if psyn in hypos:
                    acc = True
                if i == 0 and psyn in hypos:
                    acc0 = True
            if not acc:
                if df.loc[no, 'kind'] != 'unknown':
                    df.loc[no, 'accuracy'] = False
            else:
                df.loc[no, 'accuracy'] = True
            if not acc0:
                if df.loc[no, 'kind'] != 'unknown':
                    df.loc[no, 'accuracy0'] = False
            else:
                df.loc[no, 'accuracy0'] = True
            df.loc[no, 'confidence0'] = preds[no][0]['confidence']
    return df
def create_obj_scale_dictionary(obj_names, scales_dic, key_type='synset'):
    """
    Creates a dictionary associating objects in the list_aws file with their
    scales, given a dictionary of desired category<>scale or synset<>scale
    associations. Category names can be retrieved from the 'shapenet_synset'
    field on the shapenet meta db. Examples of such categories are:
    'airplane', 'bag', 'bathtub', 'train'.
    :param obj_names: list containing the object names
    :param scales_dic: a dictionary containing scale values for a particular class of objects
    :param key_type: type of keys in scales_dic; either 'synset' (default) or 'category'
    :return: association dictionary with object names as keys
    """
    try:
        print('Getting synset offsets from MongoDB...')
        db_client = pm.MongoClient(port=22334)
        table = db_client['synthetic_generative']['3d_models']
        cursor = table.find({'type': 'shapenet', 'version': 2,
                             'id': {'$in': obj_names}})
        if key_type == 'category':
            obj_scale_dic = dict()  # stores the table for id-scale correspondence
            for doc in cursor:
                offset = doc['shapenet_synset']
                synset_name = wn._synset_from_pos_and_offset(
                    offset[0], int(offset[1:])).name()
                basic_name = synset_name.split('.')[0]  # first part of the synset name
                url = "http://threedworld.s3.amazonaws.com/" + doc['id'] + '.bundle'
                if basic_name in scales_dic:
                    obj_scale_dic[url] = {'option': 'Multi_size',
                                          'scale': scales_dic[basic_name]}
                else:
                    obj_scale_dic[url] = {'option': 'Multi_size', 'scale': 1}
        elif key_type == 'synset':
            obj_scale_dic = dict()  # stores the table for id-scale correspondence
            for doc in cursor:
                offset = doc['shapenet_synset']
                url = "http://threedworld.s3.amazonaws.com/" + doc['id'] + '.bundle'
                if offset in scales_dic:
                    obj_scale_dic[url] = {'option': 'Multi_size',
                                          'scale': scales_dic[offset]}
                else:
                    obj_scale_dic[url] = {'option': 'Multi_size', 'scale': 1}
        else:
            raise ValueError("Key type should be either: 'synset' or 'category'.")
        print('Table created!')
        db_client.close()
    except Exception:
        print('Could not connect to DB. Create a SSH tunnel.')
        raise
    return obj_scale_dic
def offset2ss(offset_31, wn_31_30):
    try:
        # convert the WordNet 3.1 id to 3.0 because NLTK is limited to 3.0 for now
        offset_30 = wn_31_30[offset_31]
        synset = wn._synset_from_pos_and_offset(str(offset_30[-1:]),
                                                int(offset_30[:8]))
    except Exception:
        # handle the mapping issue between wn3.1 and wn3.0 (key not mapped);
        # the score given is 0
        unmapped_keys.add(offset_31)
        synset = None
    return synset
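# Hedged usage sketch for offset2ss(): wn_31_30 maps WordNet 3.1 ids to 3.0
# "00000000-p" strings (the key/value formats here are assumptions inferred
# from the slicing above: an 8-digit offset followed by the pos letter; the
# 3.1 key below is illustrative, not a verified mapping).
unmapped_keys = set()  # collects 3.1 ids with no 3.0 counterpart

wn_31_30 = {'02620587-v': '02614387-v'}  # illustrative single-entry mapping
print(offset2ss('02620587-v', wn_31_30))  # -> Synset('live.v.02')
print(offset2ss('99999999-v', wn_31_30))  # unmapped -> None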
def examine_synset(s):
    m = re.search(r'^(?:eng-30-)?(\d+)-(\w)', s)
    if m:
        ss = wn._synset_from_pos_and_offset(m.group(2), int(m.group(1)))
        print(ss)
    if not re.search(r'^eng-30-', s):
        s = 'eng-30-' + s
    if s in model.wv.vocab:
        print('Nearest neighbors:')
        for word, score in model.wv.most_similar(s):
            print('\t%s\t%.3f' % (word, score))
    else:
        print('Not found in the vocabulary.')
    print()
def senti_synset(self, *vals):
    if tuple(vals) in self.db:
        pos_score, neg_score = self.db[tuple(vals)]
        pos, offset = vals
        synset = wn._synset_from_pos_and_offset(pos, offset)  # @UndefinedVariable
        return SentiSynset(pos_score, neg_score, synset)
    else:
        synset = wn.synset(vals[0])  # @UndefinedVariable
        pos = synset.pos()
        offset = synset.offset()
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
            return SentiSynset(pos_score, neg_score, synset)
        else:
            return None
def __init__(self, synset):
    """ A set of synonyms that share a common meaning. """
    if isinstance(synset, WordNetSynset):
        self._wnsynset = synset
    elif isinstance(synset, Synset):
        # wrap the same underlying WordNet synset as the given Synset
        self._wnsynset = synset._wnsynset
    elif isinstance(synset, (tuple, int)):
        if isinstance(synset, int):
            synset = (synset, "NN")
        offset, pos = synset
        self._wnsynset = wn._synset_from_pos_and_offset(
            _pattern2wordnet[pos] if pos in _pattern2wordnet else pos, offset)
    else:
        raise NotImplementedError
def synsets_from_csv(self, fname, sep=","):
    # open in text mode; the lines are processed with str methods below
    with open(fname, "r") as f:
        lines = f.readlines()
    df = []
    for line in lines:
        spl = line.strip("\n").split(sep)
        try:
            synset = wn._synset_from_pos_and_offset(spl[0][0], int(spl[0][1:]))
        except Exception:
            # drop into the debugger on malformed synset ids
            import pdb
            pdb.set_trace()
        df.append({"id": spl[0], "names": spl[1], "synset": synset})
    # df = pandas.DataFrame(df, columns=['id', 'names', 'synset'])
    return df
def senti_synset(self, *vals):
    from nltk.corpus import wordnet as wn
    if tuple(vals) in self._db:
        pos_score, neg_score = self._db[tuple(vals)]
        pos, offset = vals
        synset = wn._synset_from_pos_and_offset(pos, offset)
        return SentiSynset(pos_score, neg_score, synset)
    else:
        synset = wn.synset(vals[0])
        pos = synset.pos()
        offset = synset.offset()
        if (pos, offset) in self._db:
            pos_score, neg_score = self._db[(pos, offset)]
            return SentiSynset(pos_score, neg_score, synset)
        else:
            return None
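# Hedged usage sketch for senti_synset(): it accepts either a (pos, offset)
# pair or a synset name such as 'good.a.01'. `reader` stands for any object
# exposing the _db mapping built by a SentiWordNet parser (an assumption for
# illustration, as are the attribute names on SentiSynset).
breakdown = reader.senti_synset('good.a.01')
if breakdown is not None:
    print(breakdown.pos_score, breakdown.neg_score)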
def senti_synset(self, *vals):
    if tuple(vals) in self.db:
        print("It is here")
        pos_score, neg_score = self.db[tuple(vals)]
        pos, offset = vals
        synset = wn._synset_from_pos_and_offset(pos, offset)
        return SentiSynset(pos_score, neg_score, synset)
    else:
        print("No it is here.")
        synset = wn.synset(vals[0])
        pos = synset.pos()
        offset = synset.offset()
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
            return SentiSynset(pos_score, neg_score, synset)
        else:
            return None
def predict(img):
    img = np.swapaxes(img, 0, 2)
    img = np.swapaxes(img, 1, 2)
    img = img[np.newaxis, :]
    # compute the predicted probabilities
    mod.forward(Batch([mx.nd.array(img)]))
    prob = mod.get_outputs()[0].asnumpy()
    # print the top-5
    prob = np.squeeze(prob)
    a = np.argsort(prob)[::-1]
    for i in a[0:5]:
        print('index=%d, probability=%f, class=%s' % (i, prob[i], labels[i]))
        offset = int(labels[i].split(' ')[0].strip('n'))
        synset = wordnet._synset_from_pos_and_offset('n', offset)
        paths = synset.hypernym_paths()
        paths = list(map(lambda x: '/'.join(map(lambda y: y.name(), x)), paths))
def senti_synset(self, *vals):
    print(">>> @senti_synset")
    print(">>> vals: " + str(vals))
    if tuple(vals) in self.db:
        pos_score, neg_score = self.db[tuple(vals)]
        pos, offset = vals
        synset = wn._synset_from_pos_and_offset(pos, offset)
        return SentiSynset(pos_score, neg_score, synset)
    else:
        synset = wn.synset(vals[0])
        pos = synset.pos()
        offset = synset.offset()
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
            return SentiSynset(pos_score, neg_score, synset)
        else:
            return None
def senti_synset(self, *vals):
    '''
    get pos and neg scores for a word
    input: vals is a word or a (word, pos tag) pair
    output: a SentiSynset object with the word, pos score and neg score set
    '''
    if tuple(vals) in self.db:
        pos_score, neg_score = self.db[tuple(vals)]
        pos, offset = vals
        synset = wn._synset_from_pos_and_offset(pos, offset)
        return SentiSynset(pos_score, neg_score, synset)
    else:
        synset = wn.synset(vals[0])
        pos = synset.pos()
        offset = synset.offset()
        if (pos, offset) in self.db:
            pos_score, neg_score = self.db[(pos, offset)]
            return SentiSynset(pos_score, neg_score, synset)
        else:
            return None
def __init__(self, steps=2):
    # containers assumed by the loops below; the original references them
    # without showing their initialization
    self.original_synsets = []
    self.lookupTable2 = []
    with open("data/9k.labels", "r") as _labels:
        for l in _labels:
            synset = wn._synset_from_pos_and_offset('n', int(l[1:]))
            self.original_synsets.append(synset)
    self.all_hypernyms = self.reduce(self.original_synsets, steps)
    print("reduced", len(self.original_synsets), "to", len(self.all_hypernyms))
    for i in range(len(self.original_synsets)):
        ss_name = self.original_synsets[i]
        ss_hypernyms = self.get_hypernyms(ss_name)
        ss_hypernyms_indices = []
        for h in ss_hypernyms:
            ss_hypernyms_indices.append(self.get_new_index(h))
        self.lookupTable2.append(ss_hypernyms_indices)
def parseXML(self, file):
    """
    form: (file_name) --> print
    description: fills the synset2synonym & synset2word dicts
    example:
    >>> graphWN.parseXML("wolf-1.0b4.xml")
    XML parsed
    """
    tree = etree.parse(file)
    synsets = tree.xpath("SYNSET")
    for synset in synsets:
        synset_id_orig = synset.findtext("ID")
        synonymsEl = synset.findall("SYNONYM/LITERAL")
        # build the list of synonyms from the SYNONYM elements
        syn = [synonym.text for synonym in synonymsEl]
        self.synset2synonym[synset_id_orig] = syn
        synset_id, pos = synset_id_orig.split("-")[2], synset_id_orig.split("-")[3]
        # fr-en mapping: b --> r
        if pos == "b":
            pos = "r"
        synset = wn._synset_from_pos_and_offset(pos, int(synset_id))
        synset_name = synset.name().split(".")[0]
        if synset_id_orig not in self.synset2word:
            self.synset2word[synset_id_orig] = synset_name
    print("XML parsed")
def loadWNDFile(self, wnd_file):
    print("loading WNDomains file...")
    with codecs.open(wnd_file, "r", "utf-8-sig") as f:
        lines = f.readlines()
    for line in lines:
        temp = line.strip("\n").split(" ")
        pos = temp[0].split("-")[1]
        offset = temp[0].split("-")[0]
        categories = temp[2:]
        try:
            synset = wn._synset_from_pos_and_offset(pos, int(offset)).name()
            self.dictS[synset] = categories
            for category in categories:
                if category not in self.dictC:
                    self.dictC[category] = []
                if synset not in self.dictC[category]:
                    self.dictC[category].append(synset)
        except Exception:
            continue
    print(" DONE!!")
def ilidef_to_sensekey(self, ilidef, lemma):
    '''
    given an ilidef and a lemma, this method returns the wn30 sensekey

    @requires: nltk (3.0 was used)

    @type  ilidef: str
    @param ilidef: wn30 ilidef (for example "ili-30-05768553-n")

    @type  lemma: str
    @param lemma: lemma (for example "dream")
    '''
    ili, version, offset, pos = ilidef.split('-')
    synset = wn._synset_from_pos_and_offset(pos, int(offset))
    sense_keys = [synset_lemma.key()
                  for synset_lemma in synset.lemmas()
                  if synset_lemma.key().startswith(lemma + "%")]
    if sense_keys:
        return sense_keys[0]
    else:
        return ""
def parse_src_file(self):
    lines = codecs.open(self.filename, "r", "utf8").read().splitlines()
    lines = filter(lambda x: not re.search(r"^\s*#", x), lines)
    for i, line in enumerate(lines):
        fields = re.split(r"\t+", line)
        fields = list(map(str.strip, fields))
        try:
            # POS, ID, positive score, negative score, synset terms, gloss
            pos, offset, pos_score, neg_score, synset_terms, gloss = fields
        except ValueError:
            # this guarantees the format of the sentiment lexicon
            sys.stderr.write("Line %s formatted incorrectly: %s\n" % (i, line))
            continue
        synset = wn._synset_from_pos_and_offset(pos, int(offset))
        # confirm pos and offset are present; so far no entry fails this check
        if pos and offset:
            offset = int(offset)
            self.db[(pos, offset)] = (float(pos_score), float(neg_score))
        if pos_score and neg_score:
            if (pos_score, neg_score) not in self.score_to_senti_synset:
                self.score_to_senti_synset[(pos_score, neg_score)] = []
            self.score_to_senti_synset[(pos_score, neg_score)].append(
                SentiSynset(pos_score, neg_score, synset))
        if synset:
            self.synset_to_score[synset] = (pos_score, neg_score)
def all_senti_synsets(self):
    for key, fields in self.db.items():
        pos, offset = key
        pos_score, neg_score = fields
        synset = wordnet._synset_from_pos_and_offset(pos, offset)
        yield SentiSynset(pos_score, neg_score, synset)
def offset_to_synset(offset):
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
def id2synset(self, offset):
    # strip the leading pos letter from an ImageNet-style wnid such as 'n02084071'
    x = offset[1:]
    return wn._synset_from_pos_and_offset('n', int(x))
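# Hedged usage sketch for id2synset(): ImageNet wnids are the noun pos letter
# 'n' followed by the zero-padded WordNet 3.0 offset. The wnid below is
# illustrative and should resolve to the 'dog' synset under WordNet 3.0.
from nltk.corpus import wordnet as wn

wnid = 'n02084071'
print(wn._synset_from_pos_and_offset('n', int(wnid[1:])))  # Synset('dog.n.01')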
def use_wordnet(FreelingFolder, WordnetFolder):
    """
    Call Wordnet using NLTK to get the lexnames.
    Authors: #cf, #uh
    """
    print("use_wordnet...")
    if not os.path.exists(WordnetFolder):
        os.makedirs(WordnetFolder)
    InPath = FreelingFolder + "*.xml"
    for File in glob.glob(InPath):
        LexErrCounter = collections.Counter()
        with open(File, "r") as InFile:
            Filename = os.path.basename(File)
            Text = InFile.read()
            Text = re.split(r"\n\s*?</token>", Text)
            NewText = ["<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<wrapper>"]
            for Line in Text[0:-1]:
                Line = re.sub("token", "w", Line)
                Line = re.sub("sentence", "s", Line)
                # delete the ids
                Line = re.sub(r'\sid=".*?"', "", Line)
                Line = Line + "</w>"
                Word = re.findall("form=\"(.*?)\" ", Line)[0]
                Line = re.sub("</w>", Word + "</w>", Line)
                if "wn=" in Line:
                    SynsetID = re.findall("wn=.*\"", Line)[0]
                    SynsetNumber = int(SynsetID[4:-3])
                    SynsetPOS = SynsetID[-2:-1]
                    SynsetAbbID = ""
                    try:
                        SynsetAbbID = wn._synset_from_pos_and_offset(SynsetPOS, SynsetNumber)
                    except Exception:
                        pass  # error when trying to get the synset name
                    SynsetAbbID = str(SynsetAbbID)
                    SynsetAbbID = SynsetAbbID[8:-2]
                    Lexname = ""
                    try:
                        Lexname = wn.synset(SynsetAbbID).lexname()
                    except Exception:
                        # error when trying to get the lexname
                        LexErrCounter.update({"LexNameError": 1})
                        Lexname = "xxx"
                    Line = re.sub("wn=(.*) >", "wnsyn=\\1 wnlex=\"" + Lexname + "\">", Line)
                    NewText.append(Line)
                elif "wn=" not in Line and "<s" not in Line:
                    Line = re.sub(" >", " wnsyn=\"xxx\" wnlex=\"xxx\">", Line)
                    NewText.append(Line)
                elif "<s" in Line:
                    Line = re.sub(" >", " wnsyn=\"xxx\" wnlex=\"xxx\" >", Line)
                    NewText.append(Line)
            if LexErrCounter["LexNameError"] > 0:
                print(str(LexErrCounter["LexNameError"])
                      + " lexname(s) could not be found in " + str(Filename))
            NewText.append("</s>\n</wrapper>")
            NewText = ''.join(NewText)
        with open(WordnetFolder + Filename[:-4] + ".xml", "w") as OutFile:
            OutFile.write(NewText)
    print("Done.")
def id2ss(id):
    """Given a WordNet Affect id (e.g. n#05588321) return a synset"""
    return wordnet._synset_from_pos_and_offset(str(id[:1]), int(id[2:]))
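# Hedged usage sketch for id2ss(): the 'pos#offset' id format and the example
# id come straight from the docstring above.
from nltk.corpus import wordnet

ss = id2ss('n#05588321')
print(ss.name(), '-', ss.definition())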
def get_synset_from_pos_offset(pos, offset):
    word_set = wn._synset_from_pos_and_offset(pos, offset)
    return jsonify({'extended_information': build_json_for_portion_node_1(word_set)})
def readWSD(wsdFile, sentence1, sentence2):
    textFile = open(wsdFile, "r")
    lines = []
    for line in textFile.readlines():
        lines.append(line)
    del lines[0]
    # lines look like this:
    # ['ctx_01 w2 02684924-v !! continue\n', ..., 'ctx_02 w1 01056411-n !! stop\n']
    for l in lines:
        r = l.split()
        if r[0] == 'ctx_01':
            # will work on sentence1
            root = r[len(r) - 1]
            senseVal = r[len(r) - 3]
            # strip any preceding 0 from the synset offset
            while senseVal[0] == "0":
                senseVal = senseVal[1:]
            for w in sentence1:
                if w.getRootValue() == root and isinstance(senseVal, str):
                    # splitting '13244109-n' ---> ['13244109', 'n']
                    senseVal = senseVal.split('-')
                    p = senseVal[len(senseVal) - 1]
                    num = int(senseVal[0])
                    if num != -1:
                        s = wn._synset_from_pos_and_offset(p, num)
                        s = s.__str__()
                        s = s.split("'")[1]
                        w.setSynSet(s)
        if r[0] == 'ctx_02':
            root2 = r[len(r) - 1]
            senseVal2 = r[len(r) - 3]
            # strip any preceding 0 from the synset offset
            while senseVal2[0] == "0":
                senseVal2 = senseVal2[1:]
            for w in sentence2:
                if w.getRootValue() == root2 and isinstance(senseVal2, str):
                    senseVal2 = senseVal2.split('-')
                    p = senseVal2[len(senseVal2) - 1]
                    num = int(senseVal2[0])
                    if num != -1:
                        s = wn._synset_from_pos_and_offset(p, num)
                        s = s.__str__()
                        s = s.split("'")[1]
                        w.setSynSet(s)

    SENT1NOUNS = [w1 for w1 in sentence1 if w1.getPos() == 'NOUN']
    SENT1VERBS = [w1 for w1 in sentence1 if w1.getPos() == 'VERB']
    SENT2NOUNS = [w2 for w2 in sentence2 if w2.getPos() == 'NOUN']
    SENT2VERBS = [w2 for w2 in sentence2 if w2.getPos() == 'VERB']

    # find the weights for nouns: record the max similarity value of each
    # sentence-1 noun against all sentence-2 nouns
    for n1 in SENT1NOUNS:
        if n1.getSynSet() != "null":
            noun1 = wn.synset(n1.getSynSet())  # something like 'believe.v.01'
            maxValue = 0
            for n2 in SENT2NOUNS:
                exp2 = n2.getSynSet()
                if exp2 != 'null':
                    noun2 = wn.synset(exp2)
                    value = noun1.path_similarity(noun2)
                    # path_similarity can return None when there is no path
                    if value is not None and value > maxValue:
                        maxValue = value
            n1.setWeight(maxValue)
        else:
            # if the synset is null
            n1.setWeight(1)

    # the same for verbs
    for v1 in SENT1VERBS:
        if v1.getSynSet() != "null":
            verb1 = wn.synset(v1.getSynSet())
            maxValue = 0
            for v2 in SENT2VERBS:
                if v2.getSynSet() != "null":
                    verb2 = wn.synset(v2.getSynSet())
                    value = verb1.path_similarity(verb2)
                    if value is not None and value > maxValue:
                        maxValue = value
            v1.setWeight(maxValue)
        else:
            v1.setWeight(1)

    allWordsfromSentence1 = SENT1NOUNS + SENT1VERBS
    return allWordsfromSentence1
def offset2synset(self, offset):
    '''
    offset2synset('02614387-v')
    Synset('live.v.02')
    '''
    return wn._synset_from_pos_and_offset(str(offset[-1:]), int(offset[:8]))
def get_synset_from_ssid(ssid):
    try:
        return wn._synset_from_pos_and_offset(ssid[-1:], int(ssid[:8]))
    except WordNetError as e:
        print("SSID %s not found" % ssid)
        raise e
def get_synset_from_POS_offset(pos, offset):
    synset = wn._synset_from_pos_and_offset(pos, offset)
    return synset
# (script fragment; imports assumed: os, sys, urllib.request, and
#  `from nltk.corpus import wordnet as wn`)

def getImages(key, folder_path):
    # the fragment begins mid-function; the construction of `url` from `key`
    # was not part of the original excerpt
    print(url)
    urllib.request.urlretrieve(url, filename=folder_path + "/url_list.txt")
    counter = 1
    with open(folder_path + "/url_list.txt") as url_list:
        for line in url_list:
            print(line)
            counter = counter + 1
            urllib.request.urlretrieve(line, filename=folder_path + "/" + str(counter) + ".jpg")

seedfile = open(sys.argv[1] + "/text.seed", "a")
counter = 1
for key, elem in imageNet_Syn_names.items():
    offset = int(key[1:])
    image_syn = wn._synset_from_pos_and_offset('n', offset)
    folder_name = str(counter) + "_" + key
    desc = image_syn.definition().replace(" ", "_")
    seedfile.write(desc)
    folder_name = folder_name + desc
    folder_path = sys.argv[1] + "/" + folder_name
    os.makedirs(folder_path)
    counter = counter + 1
    getImages(key, folder_path)

# get the data from the ImageNet API
# generate the mapping with the hyponym set as well
urllib.request.urlretrieve("http://image-net.org/archive/words.txt", filename="test.txt")
def category(wnid):
    pos = wnid[0]
    # assume noun
    synset = wordnet._synset_from_pos_and_offset(pos, int(str(wnid[1:]).lstrip('0')))
    return str(synset.lemmas()[0].name()).replace(" ", "_")