from collections import Counter
import re
# chooseSimKSynsets, ctgryName, keeplist, dbRepo and dbsoesvm are assumed to be
# defined elsewhere in the original project.

def wordToSynset(db, isInit = False):
  if isInit:
    db.wordSynsetMap.drop()
    query = {}
  else:
    db.wordSynsetMap.remove({'category':'Travel'})
    query = {'category':'Travel'}
  for entry in db.freqbyCtgry.find(query):
    synsetWordMap = {}
    for word in entry['wordlist']:
      for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category'])):
        if synset.name not in synsetWordMap:
          synsetWordMap[synset.name] = set([word])
        else:
          synsetWordMap[synset.name].add(word)
    synsetKfirfSumMap = Counter({k:sum(db.wordKfirf.find({'category':entry['category']})[0]['wordlist'][word] for word in synsetWordMap[k]) for k in synsetWordMap})
    for pair in synsetKfirfSumMap.most_common():
      mostSynset = pair[0]
      for word in synsetWordMap[mostSynset]:
        # Only insert <word, synset> pairs that have not been inserted before.
        if db.wordSynsetMap.find({'word': word, 'synset': mostSynset, 'category': entry['category']}).count() == 0:
          db.wordSynsetMap.insert({'word': word, 'synset': mostSynset, 'category': entry['category'], 'depth': 100})
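      # The triple-quoted string below holds an earlier, disabled version of this
      # function; it is only a string literal and has no effect at runtime.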
      """
def wordToSynset():
  # TODO: should use freqinctgry (train set only)
  for entry in db.freqbyCtgry.find():
    synsetWordMap = {}
    for word in entry['wordlist']:
      for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category'])):
        if synset.name not in synsetWordMap:
          synsetWordMap[synset.name] = set([word])
        else:
          synsetWordMap[synset.name].add(word)
    synsetKfirfSumMap = Counter({k:sum(db.kfirfbyCtgry.find({'category':entry['category']})[0]['wordlist'][word] for word in synsetWordMap[k]) for k in synsetWordMap})
    for pair in synsetKfirfSumMap.most_common():
      mostSynset = pair[0]
      for word in synsetWordMap[mostSynset]:
        db.wordSynsetMap.insert({'word': word, 'synset': mostSynset, 'category': entry['category'], 'depth': 100})
      mostSynsetWordSet = synsetWordMap.pop(mostSynset)
      # synsetWordMap is updated as words get assigned; synsetKfirfSumMap is intentionally left unchanged.
      for synset in synsetWordMap:
        synsetWordMap[synset] = synsetWordMap[synset] - mostSynsetWordSet
"""
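# --- Illustration (added; not from the original source) ---
# A minimal, self-contained sketch of the greedy assignment used above: each
# candidate synset is scored by the summed kfirf weight of the words mapped to
# it, synsets are processed from highest score down, and (as in the disabled
# version) already-assigned words are removed from the remaining candidate
# sets. All names and values below are toy data.
def _demoGreedySynsetAssignment():
  synsetWordMap = {'car.n.01': set(['car', 'auto']), 'machine.n.01': set(['auto'])}
  kfirf = {'car': 2.0, 'auto': 1.5}
  scores = Counter({k: sum(kfirf[w] for w in words) for k, words in synsetWordMap.items()})
  assignment = {}
  for synsetName, _score in scores.most_common():
    for word in synsetWordMap.get(synsetName, set()):
      assignment.setdefault(word, synsetName)
    assignedWords = synsetWordMap.pop(synsetName, set())
    for other in synsetWordMap:
      synsetWordMap[other] = synsetWordMap[other] - assignedWords
  return assignment  # {'car': 'car.n.01', 'auto': 'car.n.01'}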
def frequencySynset():
  f = open('XXXX','w')
  for entry in dbRepo.frequency.find():
    newWordlist = {}
    for word in entry['wordlist']:
      if word not in keeplist:
        continue
      if dbRepo.wordSynsetMap.find({'word': word, 'category': entry['category']}).count():
        synset = dbRepo.wordSynsetMap.find({'word': word, 'category': entry['category']})[0]['synset']
        newWordlist[re.sub('\.','__',synset)] = newWordlist.get(re.sub('\.','__',synset), 0) + entry['wordlist'][word]
      else:
        f.write(word+' '+entry['category']+'\n')
        # During real training and testing, words in the test set do not always
        # appear in the training set, so assign a synset for such words here.
        cnt = Counter({synset.name: sum(dbRepo.kfirfbyCtgry.find({'category':entry['category']})[0]['wordlist'].get(lemma.name, 0) for lemma in synset.lemmas) for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category']))})
        synset = cnt.most_common()[0][0]
        newWordlist[synset.replace('.','__')] = newWordlist.get(synset.replace('.','__'), 0) + entry['wordlist'][word]
    entry['wordlist'] = newWordlist
    dbRepo.synsetFrequency.insert(entry)
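# Note (added): synset names such as 'car.n.01' contain '.', which MongoDB does
# not allow inside document field names, hence the '.' <-> '__' substitutions
# used by the functions above. A tiny self-contained check of the round trip:
def _demoSynsetKeyEncoding():
  synsetName = 'car.n.01'
  mongoKey = synsetName.replace('.', '__')          # 'car__n__01'
  return mongoKey.replace('__', '.') == synsetName  # decodes back to 'car.n.01'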
def synsetFrequency(freqEntry):
  category = freqEntry['category']
  newWordlist = {}
  wordToSynsetMap = {}
  for word in freqEntry['wordlist']:
    if dbsoesvm.wordSynsetMap.find({'category':category, 'word': word}).count():
      synset = dbsoesvm.wordSynsetMap.find({'category':category, 'word': word})[0]['synset'].replace('.','__')
      newWordlist[synset] = newWordlist.get(synset, 0) + freqEntry['wordlist'][word]
    else:
      cnt = Counter({synset: sum(dbsoesvm.wordKfirf.find({'category':category})[0]['wordlist'].get(lemma.name, 0) for lemma in synset.lemmas) for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(category, category))})
      synset = cnt.most_common()[0][0].name.replace('.','__')
      newWordlist[synset] = newWordlist.get(synset, 0) + freqEntry['wordlist'][word]
    wordToSynsetMap[word] = synset.replace('__','.')
  freqEntry['wordlist'] = newWordlist
  print freqEntry
  return freqEntry, wordToSynsetMap, category
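# Usage sketch (added; not part of the original source). Assumes the module-level
# dbsoesvm handle above exposes a `frequency` collection like the one used by the
# other variants, and that chooseSimKSynsets/ctgryName are defined:
#   entry = dbsoesvm.frequency.find_one({'category': 'Travel'})
#   newEntry, word2synset, category = synsetFrequency(entry)
#   # newEntry['wordlist'] is now keyed by synset names ('.' encoded as '__');
#   # word2synset maps each original word to its chosen synset name.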
def frequencySynset(db):
  db.synsetFrequency.drop()
  query = {}
  f = open('XXXX', 'w')
  for entry in db.frequency.find(query, timeout = False):
    newWordlist = {}
    for word in entry['wordlist']:
      if db.wordSynsetMap.find({'word': word, 'category': entry['category']}).count():
        synset = db.wordSynsetMap.find({'word': word, 'category': entry['category']})[0]['synset']
        newWordlist[synset.replace('.','__')] = newWordlist.get(synset.replace('.','__'), 0) + entry['wordlist'][word]
      else:
        print 'XXX'
        f.write(word+' '+entry['category']+'\n')
        # During real training and testing, words in the test set do not always
        # appear in the training set, so assign a synset for such words here.
        cnt = Counter({synset.name: sum(db.wordKfirf.find({'category':entry['category']})[0]['wordlist'].get(lemma.name, 0) for lemma in synset.lemmas) for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category']))})
        synset = cnt.most_common()[0][0]
        newWordlist[synset.replace('.','__')] = newWordlist.get(synset.replace('.','__'), 0) + entry['wordlist'][word]
    entry['wordlist'] = newWordlist
    db.synsetFrequency.insert(entry)
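# End-to-end usage sketch (added; not part of the original source). The database
# name 'soesvm' is a guess based on the `dbsoesvm` handle above; the helpers
# chooseSimKSynsets/ctgryName and a running MongoDB instance are assumed.
#   from pymongo import MongoClient
#   db = MongoClient()['soesvm']
#   wordToSynset(db, isInit = True)   # build the per-category word -> synset map
#   frequencySynset(db)               # rewrite word frequencies as synset frequencies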