def intersect_vocab(db_dict, tag_file, addl_vocab=None, db_wn='', wn_list=None):
    """Intersect the ConceptNet vocabulary with the frequent Flickr tags.

    Args:
        db_dict: forwarded unchanged to ``get_conceptnet_words``.
        tag_file: forwarded unchanged to ``read_unigram``.
        addl_vocab: extra vocabulary forwarded to ``get_conceptnet_words``;
            ``None`` (the default) means an empty list.
        db_wn: accepted for the WordNet step, which is currently disabled;
            unused.
        wn_list: accepted for the WordNet step, which is currently disabled;
            unused.

    Returns:
        A ``(vocab, fr_words)`` pair of lists: ``fr_words`` holds every tag
        whose entry in the table filled by ``read_unigram`` is greater
        than 5, and ``vocab`` holds the non-empty words present in both
        ``cn_vocab`` and ``fr_words`` (in set-iteration order).
    """
    # A fresh list per call: a literal ``[]`` default would be shared by every
    # call and is handed to a helper defined outside this function.
    if addl_vocab is None:
        addl_vocab = []

    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print("ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab)))

    # NOTE(review): assumes read_unigram fills the dict in place with
    # tag -> count and returns the total tag count first -- confirm.
    tag_counts = {}
    cnt, __ = read_unigram(tag_file, tag_counts)

    # List comprehensions (not filter()) so len() keeps working when filter()
    # returns an iterator, as it does on Python 3.
    fr_words = [tag for tag in tag_counts if tag_counts[tag] > 5]
    vocab = [word for word in set(cn_vocab) & set(fr_words) if len(word) > 0]

    print("Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab)))
    print(" %d words in the intersected vocab" % len(vocab))

    # NOTE: the WordNet step (get_wordnet_words) is not applied here.
    return vocab, fr_words
def intersect_vocab(db_dict, tag_file, addl_vocab=None, db_wn='', wn_list=None):
    """Return the words shared by ConceptNet and the frequent Flickr tags.

    Args:
        db_dict: passed straight through to ``get_conceptnet_words``.
        tag_file: passed straight through to ``read_unigram``.
        addl_vocab: additional vocabulary passed to
            ``get_conceptnet_words``; defaults to an empty list when
            ``None``.
        db_wn: unused; kept for the disabled WordNet step.
        wn_list: unused; kept for the disabled WordNet step.

    Returns:
        ``(vocab, fr_words)``. ``fr_words`` is the list of tags whose value
        in the table populated by ``read_unigram`` exceeds 5; ``vocab`` is
        the list of non-empty words found in both ``cn_vocab`` and
        ``fr_words``.
    """
    # Avoid a shared mutable default: build the empty list per call.
    if addl_vocab is None:
        addl_vocab = []

    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print("ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab)))

    # NOTE(review): read_unigram presumably populates this dict with
    # tag -> count and returns the overall tag total first -- confirm.
    unigram_counts = {}
    cnt, __ = read_unigram(tag_file, unigram_counts)

    # Materialise real lists so len() works regardless of whether filter()
    # would have returned a list (Python 2) or an iterator (Python 3).
    fr_words = [tag for tag in unigram_counts if unigram_counts[tag] > 5]
    shared = set(cn_vocab) & set(fr_words)
    vocab = [word for word in shared if len(word) > 0]

    print("Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab)))
    print(" %d words in the intersected vocab" % len(vocab))

    # NOTE: WordNet filtering (get_wordnet_words) is currently disabled.
    return vocab, fr_words
def make_vocab(argv):
    """Build the Flickr and ConceptNet vocabulary files from the command line.

    Parses the options, loads the ConceptNet words via
    ``get_conceptnet_words`` and the Flickr unigram table via
    ``read_unigram``, prints summary statistics, then writes two files into
    ``--db_dir``:

    * ``vocab_flickr.txt`` -- the sorted tags whose unigram entry is greater
      than 5, one per line;
    * ``vocab_conceptnet.txt`` -- one ``key<TAB>comma-joined values`` line for
      every ``cn_words`` entry with a non-empty value.

    The intersected vocabulary is only reported, not written.

    Args:
        argv: argument list to parse. With fewer than two entries it is
            replaced by ``['-h']``, so usage is printed and the parser exits.

    Raises:
        SystemExit: raised by ``optparse`` for ``-h`` or invalid options.
    """
    if len(argv) < 2:
        argv = ['-h']

    parser = OptionParser(description='construct+compare conceptnet and flickr word similarities')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="",
                      help='dir containing sqlite db files')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db",
                      help='dictionary')
    parser.add_option("", '--unigram_file', dest='unigram_file', default="unigram.txt",
                      help='unigrams file %word count%')
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')

    # Parse the list we were handed. Parsing sys.argv here would silently
    # discard both the caller's arguments and the '-h' fallback above.
    (opts, __args) = parser.parse_args(argv)

    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    ug_file = os.path.join(opts.db_dir, opts.unigram_file)
    with open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt') as vocab_in:
        addl_vocab = vocab_in.read().split()

    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print("ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab)))

    # NOTE(review): assumes read_unigram fills the dict in place with
    # tag -> count and returns the total tag count first -- confirm.
    tag_counts = {}
    cnt, __ = read_unigram(ug_file, tag_counts)

    # Real lists (not filter()) so len() and .sort() work on Python 3 as well.
    fr_words = [tag for tag in tag_counts if tag_counts[tag] > 5]
    vocab = [word for word in set(cn_vocab) & set(fr_words) if len(word) > 0]

    print("Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab)))
    print(" %d words in the intersected vocab" % len(vocab))

    fr_words.sort()
    with open(os.path.join(opts.db_dir, 'vocab_flickr.txt'), "wt") as flickr_out:
        flickr_out.write("\n".join(fr_words))

    with open(os.path.join(opts.db_dir, 'vocab_conceptnet.txt'), "wt") as fo:
        for k, v in cn_words.items():
            if v:
                fo.write("%s\t%s\n" % (k, ",".join(v)))
def make_vocab(argv):
    """Command-line entry point that writes the vocabulary text files.

    Steps: parse options; read the additional vocabulary file; obtain
    ``(cn_words, cn_vocab)`` from ``get_conceptnet_words``; fill a unigram
    table with ``read_unigram``; print statistics about the overlap; write
    ``vocab_flickr.txt`` (sorted tags whose unigram entry is greater than 5,
    newline-separated) and ``vocab_conceptnet.txt`` (``key<TAB>v1,v2,...``
    for each ``cn_words`` entry whose value is non-empty) under ``--db_dir``.

    Args:
        argv: argument list to parse. If it has fewer than two entries the
            function parses ``['-h']`` instead, which prints usage and exits.

    Raises:
        SystemExit: from ``optparse`` on ``-h`` or on a bad option.
    """
    if len(argv) < 2:
        argv = ['-h']

    parser = OptionParser(
        description='construct+compare conceptnet and flickr word similarities'
    )
    parser.add_option('-d', '--db_dir', dest='db_dir', default="",
                      help='dir containing sqlite db files')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db",
                      help='dictionary')
    parser.add_option("", '--unigram_file', dest='unigram_file', default="unigram.txt",
                      help='unigrams file %word count%')
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')

    # Use the argv argument rather than sys.argv; otherwise the parameter and
    # the '-h' fallback set above have no effect.
    (opts, __args) = parser.parse_args(argv)

    db_dir = opts.db_dir
    db_dict = os.path.join(db_dir, opts.db_dict)
    ug_file = os.path.join(db_dir, opts.unigram_file)

    # Context managers close each file deterministically.
    with open(os.path.join(db_dir, opts.addl_vocab), 'rt') as extra_file:
        addl_vocab = extra_file.read().split()

    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print("ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab)))

    # NOTE(review): read_unigram presumably populates this dict with
    # tag -> count and returns the overall tag total first -- confirm.
    unigram_counts = {}
    cnt, __ = read_unigram(ug_file, unigram_counts)

    # Build concrete lists so len()/.sort() behave the same on Python 2 and 3.
    fr_words = [tag for tag in unigram_counts if unigram_counts[tag] > 5]
    shared = set(cn_vocab) & set(fr_words)
    vocab = [word for word in shared if len(word) > 0]

    print("Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab)))
    print(" %d words in the intersected vocab" % len(vocab))

    # The intersected vocab is reported only; just the two lists below are saved.
    fr_words.sort()
    with open(os.path.join(db_dir, 'vocab_flickr.txt'), "wt") as flickr_file:
        flickr_file.write("\n".join(fr_words))

    with open(os.path.join(db_dir, 'vocab_conceptnet.txt'), "wt") as fo:
        for k, v in cn_words.items():
            if v:
                fo.write("%s\t%s\n" % (k, ",".join(v)))