import codecs
import gzip
import os
import random
import re
import sqlite3
import sys
import urllib2
from datetime import datetime
from operator import itemgetter
from optparse import OptionParser

# Constants (FLICKR_KEY_FILE, FLICKR_XML_DIR, WNET_OUT_DIR, DEBUG) and helpers
# (cache_flickr_info, norm_tag, read_tag_cache) are assumed to be defined
# elsewhere in this package.


def rank_tags(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(description='rank tags of a single image + compose sentences')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="",
                      help='dir containing sqlite db and other data')
    parser.add_option('-n', '--num_output', dest='num_output', type="int", default=3,
                      help='number of output images to examine')
    parser.add_option('--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option('--vocab_score', dest='vocab_score', default="flickr_vscore.txt",
                      help='file containing vocabulary count')
    parser.add_option('--tag_file', dest="tag_file", default="demo-data/24.cache", help="")
    parser.add_option('--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option('--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    (opts, __args) = parser.parse_args(argv)  # parse the argv passed in, so the '-h' fallback above takes effect

    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    tag_file = os.path.join(opts.db_dir, opts.tag_file)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt').read().split()

    vocab_lines = open(os.path.join(opts.db_dir, opts.vocab_score), 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    vocab_score = {}
    for vl in vocab_lines:
        t = vl.split()  # [word, score, prc]
        vocab_score[t[0]] = map(float, t[1:])

    # gulp all the tags
    vocab_lines = open(tag_file, 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    img_tag = {}
    for vl in vocab_lines:
        t = vl.split()
        img_tag[t[0]] = t[1]
    print "read %d tags, %d images" % (len(vocab_score), len(img_tag))

    id_list = img_tag.keys()
    if opts.num_output < 0:
        # negative value: sample that many images at random
        random.shuffle(id_list)
        num_output = -opts.num_output
        id_select = id_list[:num_output * 10]
    elif opts.num_output > 1e5:
        # a very large value is interpreted as a single flickr photo id
        id_select = [str(opts.num_output)]
        num_output = 1
    else:
        num_output = opts.num_output
        id_select = id_list[:num_output * 10]

    icnt = 0
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    conn = sqlite3.connect(db_dict)
    conn.text_factory = str
    cursor = conn.cursor()
    for cur_id in id_select:
        ww = img_tag[cur_id].split(",")
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        # find the vscore of vv
        vs = map(lambda v: vocab_score[v], vv)

        # get flickr picture url:
        # http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        if 1:  # toggle to skip the flickr API lookup
            cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
            jinfo = cache_flickr_info(cur_id, cur_key, rootdir="")
            p = jinfo['photo']
            imgurl = 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % \
                (p["farm"], p["server"], p['id'], p['secret'])
        else:
            imgurl = ""

        if len(vv) > 5:
            icnt += 1
            # print results, three "tag (score, percentile)" columns per row
            print "\nimg: %s" % (imgurl if imgurl else cur_id)
            vtup = sorted(map(lambda s, t: (s, t[0], t[1]), vv, vs),
                          key=itemgetter(2), reverse=True)
            outstr = ""
            for i, t in enumerate(vtup):
                outstr += "%s (%0.3f,%2.1f%%)\t" % (t[0], t[1], 100 * t[2])
                if (i + 1) % 3 == 0:
                    outstr += "\n"
            print outstr
            """
            print "visual tags: " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]>.9 else "", vv, vs ) )
            print "other : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<=.9 and s[1]>=.6 else "", vv, vs ) )
            print "non-visual : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<.6 else "", vv, vs ) )
            """
            print ""
        if icnt >= num_output:
            break
    conn.close()
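# --- Hedged sketch: the two input formats rank_tags() expects ---
# Inferred from the parsing code above, not from separate documentation:
#   * vocab_score file: one "word score percentile" triple per line
#   * tag cache file:   one "imgid tag1,tag2,..." pair per line
# The file names and values below are illustrative placeholders only.
def write_demo_inputs(vscore_path='demo_vscore.txt', cache_path='demo.cache'):
    """ write tiny example files in the two formats rank_tags() reads """
    fh = open(vscore_path, 'wt')
    fh.write("dog\t0.812\t0.95\n")   # word, visualness score, percentile
    fh.write("love\t0.214\t0.30\n")
    fh.close()
    fh = open(cache_path, 'wt')
    fh.write("12345678\tdog,park,love\n")  # flickr id, comma-separated tags
    fh.close()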
def ingest_flickr_info(url_list, conn, argv):
    parser = OptionParser(description='return co-occurring tag counts for a given list of flickr URLs')
    parser.add_option('-w', '--wordnet_id', dest='wordnet_id', default='', help='current wnid')
    parser.add_option('-j', '--json_dir', dest='json_dir', default=FLICKR_XML_DIR,
                      help='dir to cache json metadata of each photo')
    parser.add_option('-k', '--flickr_key_file', dest='flickr_key_file', default='flickr.key.txt',
                      help='file containing a list of API keys, one per line')
    # example image url: 'http://static.flickr.com/2088/[id]_94dbc23839.jpg'
    parser.add_option("-p", "--id_pattern", dest="id_pattern",
                      default="[^/]*//.*/[0-9]*/(?P<flickrid>[0-9]*)\_([0-9a-z]*).*",
                      help="regexp to get flickr id")
    (opts, __args) = parser.parse_args(argv)

    api_keys = open(opts.flickr_key_file, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    id_pattern = re.compile(opts.id_pattern)
    csr = conn.cursor()

    icnt = 0
    ist_cnt = 0
    img_nct = 0
    fail_cnt = 0
    id_list = []
    for cur_u in url_list:
        try:
            m = id_pattern.match(cur_u)
            imgid = m.group('flickrid')
        except:
            print "\t err parsing URL " + cur_u
            continue
        icnt += 1
        cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
        jinfo = cache_flickr_info(imgid, cur_key, rootdir=opts.json_dir)
        flickrid = int(imgid)
        id_list.append(flickrid)

        # does this (wnid, flickrid) pair already have a record?
        wnf_exist = -1
        csr.execute("SELECT COUNT(*) FROM imagenet_flickr WHERE wnid=? AND flickrid=?",
                    (opts.wordnet_id, flickrid))
        for row in csr:
            wnf_exist = int(row[0])

        if 'stat' not in jinfo or not jinfo['stat'] == 'ok':
            fail_cnt += 1
            if wnf_exist == 0:  # record the failed lookup once, with status 0
                csr.execute("INSERT INTO imagenet_flickr (wnid,flickrid,status) VALUES (?,?,?)",
                            (opts.wordnet_id, flickrid, 0))
            if icnt % 100 == 0 or DEBUG > 2:
                tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
                print "%s %5d/%5d flickr img not found: %s" % (tt, icnt, len(url_list), flickrid)
        else:
            if not wnf_exist == 1:
                ist_cnt += 1
                csr.execute("INSERT INTO imagenet_flickr (wnid,flickrid,status) VALUES (?,?,?)",
                            (opts.wordnet_id, flickrid, 1))
                img_exist = 0
                csr.execute("SELECT COUNT(*) FROM flickr_info WHERE flickrid=%d" % flickrid)
                for row in csr:
                    img_exist = int(row[0])
                """
                INSERT OR REPLACE INTO Employee (id,name,role) VALUES (
                    1, 'Susan Bar',
                    coalesce((select name from Employee where id = 1),'Benchwarmer') );
                """
                if not img_exist:
                    img_nct += 1
                    pinfo = jinfo["photo"]
                    userid = pinfo["owner"]["nsid"]
                    taken_time = pinfo["dates"]["taken"]
                    image_url = cur_u
                    title = pinfo["title"]["_content"]
                    description = pinfo["description"]["_content"]
                    finfo = (flickrid, userid, taken_time, image_url, title, description)
                    csr.execute("INSERT INTO flickr_info VALUES (?,?,?,?,?,?)", finfo)
                    tmp = pinfo["tags"]["tag"]
                    tg = map(lambda s: s["_content"], tmp)
                    if tg:
                        csr.executemany("INSERT INTO flickr_tag (flickrid,tag) VALUES (?,?)",
                                        zip([flickrid] * len(tg), tg))
                    if DEBUG > 2:  # finfo and tg only exist in this branch
                        print icnt, opts.wordnet_id, flickrid
                        print repr(finfo)
                        print repr(zip([flickrid] * len(tg), tg))
                        print ""
                else:
                    print " img # %d already exists: %d" % (icnt, flickrid)
            else:  # what case is this?
                print " entry already exists:"
                csr.execute("SELECT * FROM imagenet_flickr WHERE wnid=? AND flickrid=?",
                            (opts.wordnet_id, flickrid))
                for row in csr:
                    print "  " + repr(row)

        if icnt % 100 == 0:
            conn.commit()
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d urls processed, %d new records, %d new images, %d failed" \
                % (tt, icnt, len(url_list), ist_cnt, img_nct, fail_cnt)

    conn.commit()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s %d/%d urls processed, %d new records, %d new images, %d failed \n\n" \
        % (tt, icnt, len(url_list), ist_cnt, img_nct, fail_cnt)
    return
def get_wnet_tags(wn, wtag_file, opts, img_usr=None, force_reload=False):
    """ collect each image id, usr id, and tags for the input synset """
    wn_file = os.path.join(opts.data_home, opts.wnet_list_dir, wn + '.txt')
    img_list_from_file = []
    for cl in open(wn_file, "rt"):
        if not cl:
            continue
        tmp = cl.strip().split()
        if not tmp or len(tmp) < 2:
            continue
        img_list_from_file.append(tmp[1])
    img_list_from_file.sort()
    numim = len(img_list_from_file)

    imgid_list = []
    usr_list = []
    tag_dict = {}
    if os.path.isfile(wtag_file) and not force_reload:
        # reuse the cached "imgid \t uid \t tag1,tag2,..." file
        for cl in codecs.open(wtag_file, encoding='utf-8', mode="r"):
            tt = cl.strip().split()
            imgid_list.append(tt[0])
            usr_list.append(tt[1])
            tag_dict[tt[0]] = tt[2].split(",")
        print " read %d image entries from %s " % (len(imgid_list), wtag_file)
    else:
        imgid_list = img_list_from_file
        if not img_usr:
            # load the image-id -> user-id map, transparently handling .gz input
            fn = os.path.join(opts.data_home, opts.db_dir, opts.usr_file)
            fh = gzip.open(fn, "rb") if os.path.splitext(fn)[1] == ".gz" else open(fn, "r")
            lines = filter(len, fh.read().split("\n"))  # skip empty line if any
            lines = [cl.strip().split() for cl in lines]
            img_usr = dict(lines)
            fh.close()
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s read %d usr ids from %s" % (tt, len(img_usr), opts.usr_file)
        usr_list = map(lambda s: img_usr[s] if s in img_usr else "unk", imgid_list)

        cur_cache_id = []
        empty_cnt = 0
        del_id = []
        tag_dict = {}
        fh = codecs.open(wtag_file, encoding='utf-8', mode="w")
        for (imgid, uid, ii) in zip(imgid_list, usr_list, range(len(usr_list))):
            if opts.use_json:
                jinfo = cache_flickr_info(imgid, "",
                                          rootdir=os.path.join(opts.data_home, opts.json_dir))
                if not jinfo or 'stat' not in jinfo or not jinfo['stat'] == 'ok':
                    ww = []  # no valid metadata; do not reuse tags from the previous image
                else:
                    pinfo = jinfo["photo"]
                    usr = pinfo["owner"]["nsid"]
                    tt = pinfo["tags"]["tag"]
                    ww = map(lambda s: s["_content"], tt)
                    if not uid == usr:
                        # user info mismatch: trust the json metadata
                        uid = usr
            else:  # use tag cache
                if not cur_cache_id == imgid[:2]:
                    # read new cache file, keyed by the first two chars of the img id
                    cur_cache_id = imgid[:2]
                    cur_cache_file = os.path.join(opts.data_home, opts.tag_cache_dir,
                                                  cur_cache_id + ".cache")
                    cache_dict = read_tag_cache(cur_cache_file)
                if imgid in cache_dict:
                    ww = cache_dict[imgid].split(",")
                else:  # img has no tag
                    ww = []
                    empty_cnt += 1
            if ww:
                tag_dict[imgid] = ww
                fh.write("%s\t%s\t%s\n" % (imgid, uid, ",".join(ww)))
            else:
                del_id.append(ii)
        fh.close()

        del_id.sort(reverse=True)  # pop bigger indexes first, trouble otherwise
        for ii in del_id:
            imgid_list.pop(ii)
            usr_list.pop(ii)

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print '%s wnet "%s" - %d imgs, %d with tags, %d unique users' % \
        (tt, wn, numim, len(imgid_list), len(set(usr_list)))
    return imgid_list, usr_list, tag_dict
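# --- Hedged sketch of read_tag_cache(), which is referenced above but not
# defined in this file. From its call sites, a .cache file appears to hold one
# "imgid<tab>tag1,tag2,..." pair per line, and the returned dict maps imgid to
# the raw comma-separated tag string. The name _sketch marks this as a stand-in.
def read_tag_cache_sketch(cache_file):
    """ minimal stand-in: imgid -> "tag1,tag2,..." """
    cache_dict = {}
    if not os.path.isfile(cache_file):
        return cache_dict
    for cl in codecs.open(cache_file, encoding='utf-8', mode='r'):
        tt = cl.strip().split()
        if len(tt) >= 2:
            cache_dict[tt[0]] = tt[1]
    return cache_dict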
def download_sbu_imgs(url_file, id_file, img_root_dir,
                      startnum=0, endnum=50, hash_level=2, chars_per_hash=2):
    ss = os.sep
    exist_cnt = 0
    err_cnt = 0
    good_cnt = 0
    cnt = 0
    url_lines = open(url_file, 'rt').read().split("\n")
    id_lines = open(id_file, 'rt').read().split("\n")
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    json_dir = os.path.join(os.path.split(img_root_dir)[0], 'sbu-json')

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processing #%d - #%d of %d urls" % (tt, startnum, endnum, len(url_lines))
    ii = startnum
    while ii < endnum and ii < len(url_lines):
        imgid = id_lines[ii]
        imgurl = url_lines[ii]
        cnt += 1

        # bucket images into nested dirs named by consecutive slices of the id,
        # e.g. 12/34/1234567890.jpg with the default two levels of two chars
        hdir = []
        for i in range(hash_level):
            curs = i * chars_per_hash
            hdir.append(imgid[curs:curs + chars_per_hash])
        outdir = os.path.join(img_root_dir, ss.join(hdir))
        imfile = os.path.join(outdir, imgid + ".jpg")

        cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
        _jinfo = cache_flickr_info(imgid, cur_key, rootdir=json_dir)
        #if 'stat' not in jinfo or not jinfo['stat']=='ok':
        #    err_cnt += 1
        #    continue

        if not os.path.exists(imfile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            try:
                buf = urllib2.urlopen(imgurl).read()
                fh = open(imfile, 'wb')
                fh.write(buf)
                fh.close()
                good_cnt += 1
            except:
                print " ERR downloading url #%d, img %s from %s" % (ii, imgid, imgurl)
                err_cnt += 1
        else:
            exist_cnt += 1

        ii += 1
        if cnt % 100 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s processed %d urls: %d new, %d exist, %d err" % \
                (tt, cnt, good_cnt, exist_cnt, err_cnt)

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processed %d urls: %d new, %d exist, %d err \n\t from %s" % \
        (tt, cnt, good_cnt, exist_cnt, err_cnt, url_file)
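# --- Hedged helper mirroring the hashed directory layout used above ---
# With the defaults (hash_level=2, chars_per_hash=2), an image id such as
# "1234567890" is stored under <img_root_dir>/12/34/1234567890.jpg. This
# helper is not part of the original source; it just restates the bucketing.
def hashed_img_path(img_root_dir, imgid, hash_level=2, chars_per_hash=2):
    """ return the path download_sbu_imgs() would use for this image id """
    hdir = []
    for i in range(hash_level):
        curs = i * chars_per_hash
        hdir.append(imgid[curs:curs + chars_per_hash])
    return os.path.join(img_root_dir, os.sep.join(hdir), imgid + ".jpg")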
def compile_flickr_info(url_list, argv):
    parser = OptionParser(description='return co-occurring tag counts for a given list of flickr URLs')
    parser.add_option('-o', '--out_dir', dest='out_dir', default=WNET_OUT_DIR,
                      help='output dir for wordnet-flickr log file')
    parser.add_option('-w', '--wordnet_id', dest='wordnet_id', default='', help='current wnid')
    parser.add_option('-j', '--json_dir', dest='json_dir', default=FLICKR_XML_DIR,
                      help='dir to cache json metadata of each photo')
    parser.add_option('-k', '--flickr_key_file', dest='flickr_key_file', default='flickr.key.txt',
                      help='file containing a list of API keys, one per line')
    # example image url: 'http://static.flickr.com/2088/[id]_94dbc23839.jpg'
    parser.add_option("-p", "--id_pattern", dest="id_pattern",
                      default="[^/]*//.*/[0-9]*/(?P<flickrid>[0-9]*)\_([0-9a-z]*).*",
                      help="regexp to get flickr id")
    (opts, __args) = parser.parse_args(argv)

    api_keys = open(opts.flickr_key_file, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    id_pattern = re.compile(opts.id_pattern)

    out_file = os.path.join(opts.out_dir, opts.wordnet_id + ".txt")
    if os.path.exists(out_file):
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s output '%s' already exists, RETURN \n\n" % (tt, out_file)
        return

    icnt = 0
    good_cnt = 0
    fail_cnt = 0
    dup_cnt = 0
    id_url = {}
    for cur_u in url_list:
        icnt += 1
        try:
            m = id_pattern.match(cur_u)  # assume url already contains flickr.com/
            imgid = m.group('flickrid')
        except:
            print "\t err parsing URL " + cur_u
            continue
        flickrid = int(imgid)
        if flickrid in id_url:
            dup_cnt += 1  # count the duplicate before skipping it
            continue
        else:
            cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
            jinfo = cache_flickr_info(imgid, cur_key, rootdir=opts.json_dir)
            if 'stat' not in jinfo or not jinfo['stat'] == 'ok':
                fail_cnt += 1
            else:
                good_cnt += 1
                id_url[flickrid] = cur_u
        if icnt % 200 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d urls processed, %d good records, %d dups, %d failed" \
                % (tt, icnt, len(url_list), good_cnt, dup_cnt, fail_cnt)

    # write out the resulting (wnid, flickrid, url) tuples
    if not os.path.exists(out_file):
        fh = open(out_file, 'wt')
        for ii, uu in id_url.iteritems():
            fh.write("%s\t%11d\t%s\n" % (opts.wordnet_id, ii, uu))
        fh.close()
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s %d urls processed, %d good records, %d dups, %d failed \n\n" \
            % (tt, len(url_list), good_cnt, dup_cnt, fail_cnt)
    else:
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s %d urls processed, SKIP existing output file %s \n\n" \
            % (tt, len(url_list), out_file)
    return
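# --- Hedged usage sketch for compile_flickr_info() ---
# The file name and wnid below are hypothetical placeholders; the real driver
# is not part of this file. It reads one image URL per line for a synset and
# passes the synset id and output dir through the option list.
def run_compile_example():
    url_list = open("n02084071_urls.txt", "rt").read().split("\n")  # hypothetical input file
    url_list = filter(len, url_list)
    compile_flickr_info(url_list, ["-w", "n02084071", "-o", "./wnet-out"])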