def synsetFrequency(freqEntry):
  """Re-key one frequency entry by synset name instead of by word.

  Every word in ``freqEntry['wordlist']`` is resolved to a synset and its
  value is added to that synset's running total:

  * if ``dbsoesvm.wordSynsetMap`` holds a document for (category, word), the
    document's ``'synset'`` value is used;
  * otherwise the candidates returned by ``chooseSimKSynsets`` are scored by
    summing, over each candidate's lemmas, the lemma name's value in the
    category's ``dbsoesvm.wordKfirf`` wordlist, and the top-scoring candidate
    is used.

  Keys of the new wordlist are synset names with '.' replaced by '__'
  (presumably because the result is stored as MongoDB keys -- confirm).

  Args:
    freqEntry: dict with a ``'category'`` value and a ``'wordlist'`` dict
      mapping word -> numeric value. Its ``'wordlist'`` is replaced in place.

  Returns:
    A tuple ``(freqEntry, wordToSynsetMap, category)`` where ``freqEntry`` is
    the same (mutated) dict that was passed in and ``wordToSynsetMap`` maps
    each input word to the synset name chosen for it.

  Raises:
    IndexError: if an unmapped word gets no candidates from
      ``chooseSimKSynsets`` (``most_common()`` is then empty), or if the
      ``wordKfirf`` lookup for the category returns nothing to index.
  """
  category = freqEntry['category']
  wordlist = freqEntry['wordlist']
  newWordlist = {}
  wordToSynsetMap = {}
  # The category's kfirf wordlist does not change between words, so it is
  # fetched at most once, and only if some word actually needs the fallback.
  kfirfWeights = None
  for word in wordlist:
    # One find_one() instead of find().count() followed by find()[0].
    mapping = dbsoesvm.wordSynsetMap.find_one({'category': category, 'word': word})
    if mapping is not None:
      key = mapping['synset'].replace('.', '__')
    else:
      if kfirfWeights is None:
        kfirfWeights = dbsoesvm.wordKfirf.find({'category': category})[0]['wordlist']
      candidates = chooseSimKSynsets(word, 3, category=ctgryName.get(category, category))
      scores = Counter({
          candidate: sum(kfirfWeights.get(lemma.name, 0) for lemma in candidate.lemmas)
          for candidate in candidates})
      key = scores.most_common()[0][0].name.replace('.', '__')
    newWordlist[key] = newWordlist.get(key, 0) + wordlist[word]
    # Same round-trip as before: derive the dotted name back from the key.
    wordToSynsetMap[word] = key.replace('__', '.')
  freqEntry['wordlist'] = newWordlist
  # Parenthesised single argument: prints the same under Python 2 and 3.
  print(freqEntry)
  return freqEntry, wordToSynsetMap, category
# --- Example no. 2 ---
# 0
def frequencySynset(db):
  """Rebuild ``db.synsetFrequency`` from ``db.frequency``, keyed by synset.

  The ``synsetFrequency`` collection is dropped first. Then, for every
  document in ``db.frequency``, each word of its ``'wordlist'`` is resolved
  to a synset name and the word's value is added to that synset's total:

  * if ``db.wordSynsetMap`` holds a document for (word, category), its
    ``'synset'`` value is used;
  * otherwise the word and category are appended to the file ``'XXXX'``,
    'XXX' is printed, and the candidate from ``chooseSimKSynsets`` with the
    highest summed ``db.wordKfirf`` value over its lemma names is used.

  The entry, with its ``'wordlist'`` replaced by the synset-keyed dict (keys
  have '.' replaced by '__'), is inserted into ``db.synsetFrequency``.

  Args:
    db: database handle exposing the ``frequency``, ``wordSynsetMap``,
      ``wordKfirf`` and ``synsetFrequency`` collections.

  Raises:
    IndexError: if an unmapped word gets no candidates from
      ``chooseSimKSynsets`` (``most_common()`` is then empty), or if the
      ``wordKfirf`` lookup for the category returns nothing to index.
  """
  db.synsetFrequency.drop()
  query = {}
  # 'with' guarantees the log of unmapped words is flushed and closed, even
  # if a lookup below raises.
  with open('XXXX', 'w') as f:
    for entry in db.frequency.find(query, timeout = False):
      category = entry['category']
      wordlist = entry['wordlist']
      newWordlist = {}
      # Per-category kfirf wordlist: fetched at most once per entry, and only
      # if some word actually needs the fallback.
      kfirfWeights = None
      for word in wordlist:
        # One find_one() instead of find().count() followed by find()[0].
        mapping = db.wordSynsetMap.find_one({'word': word, 'category': category})
        if mapping is not None:
          synset = mapping['synset']
        else:
          print('XXX')
          f.write(word+' '+category+'\n')
          # Words in the test set are not always present in the training
          # set, so a synset has to be assigned to them here.
          if kfirfWeights is None:
            kfirfWeights = db.wordKfirf.find({'category': category})[0]['wordlist']
          # NOTE(review): this assumes chooseSimKSynsets yields synset *name
          # strings* (they are passed to wn.synset() and used as string keys),
          # whereas synsetFrequency treats them as Synset objects -- confirm.
          cnt = Counter({
              name: sum(kfirfWeights.get(lemma.name, 0) for lemma in wn.synset(name).lemmas)
              for name in chooseSimKSynsets(word, 3, category = ctgryName.get(category, category))})
          # most_common() yields (key, count) pairs; only the key is wanted.
          synset = cnt.most_common()[0][0]
        key = synset.replace('.', '__')
        newWordlist[key] = newWordlist.get(key, 0) + wordlist[word]
      entry['wordlist'] = newWordlist
      db.synsetFrequency.insert(entry)