from database import PwdDb
from tagset_conversion import TagsetConverter


def main():
    db = PwdDb()
    tg = TagsetConverter()  # assumes the pwds are pos-tagged using Brown tags

    offset = 0
    size = 1000000

    # output_file = open('../results/semantic/nouns/{0}_{1}.txt'.format(offset, size), 'wb')
    output_file = open('../results/pos/verbs/all.txt', 'wb')

    while db.hasNext():  # for i in range(offset, offset + size):
        words = db.nextPwd()  # list of Words
        for w in words:
            if w.pos is None:
                continue
            wn_pos = tg.brownToWordNet(w.pos)
            if wn_pos == 'v':
                output_file.write(str(w.word) + '\n')

    output_file.close()
    db.finish()
    return 0


if __name__ == '__main__':
    main()
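# Not part of the original script: since the file written above holds one
# verb per line, frequency counts are straightforward with collections.Counter.
# A minimal sketch, assuming the output path used above:
from collections import Counter

def count_extracted_verbs(path='../results/pos/verbs/all.txt'):
    with open(path) as f:
        counts = Counter(line.strip() for line in f if line.strip())
    # print the ten most frequent verbs, tab-separated
    for verb, freq in counts.most_common(10):
        print '{}\t{}'.format(verb, freq)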
from database import PwdDb
from tagset_conversion import TagsetConverter


def main():
    db = PwdDb()
    tc = TagsetConverter()  # assumes the pwds are pos-tagged using Brown tags

    pos_dist = dict()
    wn_pos_dist = dict()

    fragments_total = 0  # number of words
    pos_total = 0        # number of pos-tagged words
    wn_verbs_total = 0   # number of verbs that are found in wordnet
    wn_nouns_total = 0   # number of nouns that are found in wordnet

    while db.hasNext():
        words = db.nextPwd()  # list of Words
        fragments_total += len(words)
        for w in words:
            if w.pos is None:
                continue
            pos_total += 1
            if w.pos in pos_dist:
                pos_dist[w.pos] += 1
            else:
                pos_dist[w.pos] = 1
            wn_pos = tc.brownToWordNet(w.pos)
            if wn_pos in wn_pos_dist:
                wn_pos_dist[wn_pos] += 1
            else:
                wn_pos_dist[wn_pos] = 1
            if w.synsets is not None:
                if wn_pos == 'v':
                    wn_verbs_total += 1
                elif wn_pos == 'n':
                    wn_nouns_total += 1

    db.finish()

    # convert to a list of tuples so we can sort it by value
    pos_dist = sorted(pos_dist.items(), key=lambda entry: entry[1], reverse=True)

    print "Total number of fragments: {}".format(fragments_total)
    print 'of which {} are POS-tagged words ({}%)'.format(pos_total, float(pos_total) * 100 / fragments_total)
    print '\nPOS distribution (Brown tagset):\n'
    for k, v in pos_dist:
        print "{}\t{}".format(k, v)
    print '\nPOS distribution (WordNet tagset):\n', wn_pos_dist
    print '\n{} verbs found in WordNet ({}% of verbs)'.format(wn_verbs_total, float(wn_verbs_total) * 100 / wn_pos_dist['v'])
    print '\n{} nouns found in WordNet ({}% of nouns)'.format(wn_nouns_total, float(wn_nouns_total) * 100 / wn_pos_dist['n'])
    return 0


if __name__ == '__main__':
    main()
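# Not part of the original script: a plausible sketch of what
# TagsetConverter.brownToWordNet does, mapping a Brown tag to one of
# WordNet's four POS letters by prefix and returning None for tags WordNet
# has no counterpart for (determiners, pronouns, numerals, ...). The real
# mapping lives in tagset_conversion and may differ.
def brown_to_wordnet_sketch(brown_tag):
    tag = brown_tag.lower()
    if tag.startswith('nn'):
        return 'n'  # nouns: nn, nns, nn$
    if tag.startswith('vb'):
        return 'v'  # verbs: vb, vbd, vbg, vbn, vbz
    if tag.startswith('jj'):
        return 'a'  # adjectives: jj, jjr, jjt
    if tag.startswith('rb'):
        return 'r'  # adverbs: rb, rbr, rbt
    return None     # no WordNet counterpart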
def main(): """ Tags the passwords by semantic categories, assuming it's already pos and sentiment-tagged. It doesn't need the sentiment info, but it gets the synset # that was gotten in the sentiment tagging process, to reduce overhead. """ db = PwdDb() tagger = SemanticTagger() tg = TagsetConverter() # assumes the pwds are pos-tagged using Brown tags print "tagging process initialized..." start = time() csv_writer = csv.writer(open("../results/semantic/test.csv","wb"), dialect='excel') # while (db.hasNext()): for i in range(1,100001): words = db.nextPwd() # list of Words for w in words: t = None # if there's a synset for this word if w.synsets is not None: wn_pos = tg.brownToWordNet(w.pos) t = tagger.tag(w.word, wn_pos, w.synsets) else: t = tagger.tag(w.word, w.pos) w.category = t db.saveCategory(w) csv_writer.writerow([i, w.word, w.category, w.senti, w.pos]) db.finish() print "tagging process took " + str(time()-start) + " seconds." return 0;
'''
Checks which Brown tags are not covered in wordnet. In addition, I wanted
to see the impact on the pos-tagging of the passwords and review my mapping
brown -> wordnet.

Created on 2013-03-12

@author: rafa
'''
from database import PwdDb
from tagset_conversion import TagsetConverter

if __name__ == '__main__':
    db = PwdDb()
    tagconverter = TagsetConverter()

    # tags not covered by wordnet
    notcovered = dict()

    while db.hasNext():
        p = db.nextPwd()
        for w in p:
            if w.pos is not None and tagconverter.brownToWordNet(w.pos) is None:
                freq = notcovered[w.pos] if w.pos in notcovered else 0
                notcovered[w.pos] = freq + 1

    db.finish()

    print notcovered
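# Not part of the original script: the printed dict is easier to eyeball
# ranked by frequency, using the same sorted(..., reverse=True) idiom as
# the distribution script above.
def rank_uncovered(notcovered):
    ranked = sorted(notcovered.items(), key=lambda entry: entry[1], reverse=True)
    for tag, freq in ranked:
        print '{}\t{}'.format(tag, freq)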