def submit(self, downloading_locker):
    paras = []
    for id in self.venue_ids:
        paras.append((id, self.job_id))
    do_multithread_job(instagram_do_work, paras, 10, './log/log_' + str(self.job_id) + '_insta.txt')
    do_multithread_job(foursquare_do_work, paras, 10, './log/log_' + str(self.job_id) + '_meta.txt')
    do_multithread_job(foursquare_tips_do_work, paras, 10, './log/log_' + str(self.job_id) + '_tips.txt')
    do_multithread_job(foursquare_photos_do_work, paras, 10, './log/log_' + str(self.job_id) + '_photos.txt')
    self.status = 'Finished'
    downloading_locker.clear()
    # Export the per-job tables as CSV files into the web-served data directory.
    meta_file_name = "metadata_user_" + str(self.job_id) + ".csv"
    insta_file_name = "instagram_user_" + str(self.job_id) + ".csv"
    tips_file_name = "tips_user_" + str(self.job_id) + ".csv"
    foursquare_photo_file_name = "foursquare_photo_user_" + str(self.job_id) + ".csv"
    sql = ("SELECT * FROM venue_meta" + str(self.job_id) +
           " INTO OUTFILE '/db/www/places-crawl/data/" + meta_file_name + "'"
           " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'")
    cursor = connect_to_mysql()
    try:
        cursor.execute(sql)
    except Exception as e:
        print 'OUTPUT TO DB FOLDER ERROR'
    sql = ("SELECT * FROM venue_photo_instagram" + str(self.job_id) +
           " INTO OUTFILE '/db/www/places-crawl/data/" + insta_file_name + "'"
           " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'")
    try:
        cursor = connect_to_mysql()
        cursor.execute(sql)
    except Exception as e:
        print "OUTPUT TO INSTAGRAM FOLDER ERROR"
    sql = ("SELECT * FROM venue_tips" + str(self.job_id) +
           " INTO OUTFILE '/db/www/places-crawl/data/" + tips_file_name + "'"
           " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'")
    try:
        cursor = connect_to_mysql()
        cursor.execute(sql)
    except Exception as e:
        print "OUTPUT TO FOURSQUARE_TIPS FOLDER ERROR"
    sql = ("SELECT * FROM venue_photo_4sq" + str(self.job_id) +
           " INTO OUTFILE '/db/www/places-crawl/data/" + foursquare_photo_file_name + "'"
           " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'")
    try:
        cursor = connect_to_mysql()
        cursor.execute(sql)
    except Exception as e:
        print "OUTPUT TO FOURSQUARE_PHOTO FOLDER ERROR"
    return "job done"
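# The four INTO OUTFILE blocks in submit() differ only in table name and output
# file, so they could share one helper. A minimal sketch, assuming the same
# connect_to_mysql as above; export_table_to_csv is a hypothetical name, not
# part of the original code:
def export_table_to_csv(table, csv_name):
    sql = ("SELECT * FROM " + table +
           " INTO OUTFILE '/db/www/places-crawl/data/" + csv_name + "'"
           " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'")
    try:
        cursor = connect_to_mysql()
        cursor.execute(sql)
    except Exception as e:
        print 'OUTPUT ERROR for %s: %s' % (table, e)

# Example (mirrors the first export in submit above):
# export_table_to_csv('venue_meta' + str(self.job_id), meta_file_name)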
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    cursor = connect_to_mysql()
    docs = []
    names = []
    comments_count = []
    for id in foursquare_ids:
        sql = ("select caption from plazas_instaphoto "
               "where caption is not NULL and foursquare_venue_id='" + id + "'")
        venue_name = foursquare_ids[id]
        cursor.execute(sql)
        res = cursor.fetchall()
        # Concatenate every caption for this venue into a single document.
        doc = ""
        for r in res:
            doc += r['caption']
        docs.append(doc)
        names.append(venue_name)
        comments_count.append(len(res))
    t = TFIDF()
    res = t.compute_tfidf(docs)
    for i in range(len(res)):
        print 'Plaza name : ' + names[i].encode('utf-8', 'ignore')
        print 'Comments for this Plaza : ' + str(comments_count[i])
        print res[i]
def export_temporal_data(time_window, start_time, end_time):
    fetched_results = get_all_stats()
    ids = get_all_foursquare_ids()
    venue_dic = {}
    for r in ids.keys():
        venue_dic[r] = []
    cur_time = start_time
    # Round the start time up to the nearest hour/day/week boundary.
    if time_window == 'hour':
        time_window = timedelta(hours=2)
        cur_time = cur_time - timedelta(minutes=cur_time.minute,
                                        seconds=cur_time.second,
                                        microseconds=cur_time.microsecond) + timedelta(hours=1)
    elif time_window == 'day':
        time_window = timedelta(days=1)
        cur_time = cur_time - timedelta(hours=cur_time.hour,
                                        minutes=cur_time.minute,
                                        seconds=cur_time.second,
                                        microseconds=cur_time.microsecond) + timedelta(days=1)
    elif time_window == 'week':
        time_window = timedelta(weeks=1)
        cur_time = cur_time - timedelta(hours=cur_time.hour,
                                        minutes=cur_time.minute,
                                        seconds=cur_time.second,
                                        microseconds=cur_time.microsecond) + timedelta(weeks=1)
    data_type = 'checkinsCount'
    time_list = ['Plaza/Time']
    while cur_time > start_time and cur_time <= end_time:
        print cur_time
        time_list.append(cur_time.hour)
        for id in venue_dic.keys():
            sql = ("select * from venue_stats where id = '" + id + "' and time between '" +
                   str(cur_time) + "' and '" + str(cur_time + time_window) + "'")
            cursor = connect_to_mysql()
            cursor.execute(sql)
            res = cursor.fetchall()
            if len(res) != 0:
                # The counter is cumulative, so the per-window value is its spread.
                tmp = [val[data_type] for val in res]
                value = max(tmp) - min(tmp)
            else:
                value = 0
            venue_dic[id].append(value)
        cur_time += time_window
    f = open('./temporal_2hour.csv', 'wt')
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(time_list)
    for key in venue_dic.keys():
        venue_dic[key].insert(0, ids[key])
        writer.writerow(venue_dic[key])
    f.close()
    print data_type
    print time_list
    for id in ids.keys():
        print ids[id], venue_dic[id]
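# A minimal usage sketch (hypothetical entry point, illustrative dates only);
# note that time_window == 'hour' actually steps in two-hour buckets
# (timedelta(hours=2)), which matches the temporal_2hour.csv file name.
if __name__ == '__main__':
    from datetime import datetime
    export_temporal_data('hour',
                         datetime(2013, 6, 1, 0, 0, 0),
                         datetime(2013, 6, 8, 0, 0, 0))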
def main():
    sql = "select distinct(foursquare_venue_id) as id from plazas_instaphoto"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    ids = []
    for r in cursor.fetchall():
        ids.append(r['id'])
    do_multithread_job(do_work, ids, 20, './log/dump_pics.log')
def main():
    already_in = set()
    sql = "select distinct(foursquare_venue_id) as id from plazas_instaphoto"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    for r in cursor.fetchall():
        already_in.add(r['id'])
    sql = "select id from plazas_nyc"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    ids = []
    for r in cursor.fetchall():
        # if r['id'] not in already_in:
        ids.append(r['id'])
    do_multithread_job(do_work, ids, 5, './log/download_instagram.log')
    print ids
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = ("select comments from venue_photo_instagram "
               "where comments is not NULL and foursquare_venue_id='" + id + "'")
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}
        cursor.execute(sql)
        res = cursor.fetchall()
        for r in res:
            comments = json.loads(r["comments"])
            for sentence in comments:
                words = tokenize(sentence[1])
                for word in words:
                    # Track corpus-wide frequency and per-venue frequency.
                    if word in global_word_freq:
                        global_word_freq[word] += 1
                    else:
                        global_word_freq[word] = 1
                    if word in venues[venue_name]:
                        venues[venue_name][word] += 1
                    else:
                        venues[venue_name][word] = 1
        # Document frequency: number of venues whose comments contain the word.
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    # Treat the 300 most frequent words corpus-wide as stopwords.
    too_common_word = set()
    for word in sorted(global_word_freq.items(), key=lambda tup: tup[1], reverse=True)[:300]:
        too_common_word.add(word[0])
    print too_common_word
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word] / (1 + math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append((word, score, words[word]))
        print venue_name
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:20]
def main():
    # Grab just the first 10 venue ids as a smoke test.
    sql = "select id from plazas"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    ids = []
    count = 10
    for r in cursor.fetchall():
        ids.append(r['id'])
        count -= 1
        if count <= 0:
            break
    do_multithread_job(do_work, ids, 10, 'do_multi.log')
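# The Python-side countdown above can be pushed into the query itself; a
# minimal sketch of the same ten-id smoke test using SQL LIMIT
# (main_with_limit is a hypothetical name, not in the original code):
def main_with_limit():
    cursor = connect_to_mysql()
    cursor.execute("select id from plazas limit 10")
    ids = [r['id'] for r in cursor.fetchall()]
    do_multithread_job(do_work, ids, 10, 'do_multi.log')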
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = ("select comments from plazas_instaphoto "
               "where comments is not NULL and foursquare_venue_id='" + id + "'")
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}
        cursor.execute(sql)
        res = cursor.fetchall()
        for r in res:
            comments = json.loads(r['comments'])
            for sentence in comments:
                words = tokenize(sentence[1])
                for word in words:
                    if word in global_word_freq:
                        global_word_freq[word] += 1
                    else:
                        global_word_freq[word] = 1
                    if word in venues[venue_name]:
                        venues[venue_name][word] += 1
                    else:
                        venues[venue_name][word] = 1
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    too_common_word = set()
    for word in sorted(global_word_freq.items(), key=lambda tup: tup[1], reverse=True)[:300]:
        too_common_word.add(word[0])
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word] / (1 + math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append((word, score, words[word]))
        print venue_name
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:20]
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = ("select caption from plazas_instaphoto "
               "where caption is not NULL and foursquare_venue_id='" + id + "'")
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}
        cursor.execute(sql)
        res = cursor.fetchall()
        for r in res:
            # Captions are plain strings, unlike the JSON-encoded comment lists.
            words = tokenize(r['caption'])
            for word in words:
                if word in global_word_freq:
                    global_word_freq[word] += 1
                else:
                    global_word_freq[word] = 1
                if word in venues[venue_name]:
                    venues[venue_name][word] += 1
                else:
                    venues[venue_name][word] = 1
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    too_common_word = set()
    for word in sorted(global_word_freq.items(), key=lambda tup: tup[1], reverse=True)[:300]:
        too_common_word.add(word[0])
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word] / (1 + math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append((word, score, words[word]))
        print venue_name.encode('utf-8', 'ignore')
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:50]
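# The scripts above all rank words by raw term frequency damped by log document
# frequency: tf / (1 + log(df)). A minimal sketch of that weighting as a
# standalone helper (score_words is a hypothetical name, not in the original):
import math

def score_words(venue_word_freq, word_doc_freq, stopwords):
    scored = []
    for word, tf in venue_word_freq.items():
        if word in stopwords:
            continue
        # Damp common words: a term seen in many venues gets a smaller score.
        score = float(tf) / (1 + math.log(word_doc_freq[word]))
        scored.append((word, score, tf))
    return sorted(scored, key=lambda tup: tup[1], reverse=True)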
def main():
    sql = "select * from manhattan_venues_meta where checkinsCount > 1000 and tipCount > 20"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    all_venues = cursor.fetchall()
    ids = []
    for r in all_venues:
        ids.append(r['id'])
    ids = set(ids)
    crawler = VenuePhotoCrawlerInstagram()
    cnt = 0
    for id in ids:
        cnt += 1
        logging.debug('crawling %d' % (cnt))
        # Sample roughly 35% of the qualifying venues.
        if random.random() < 0.35:
            crawler.grab_photos(id, 10)
def main():
    sql = ("select manhattan_venues_meta.id as id, venue_photo_instagram.comments as comment, "
           "categoriesName as cate from manhattan_venues_meta, venue_photo_instagram "
           "where manhattan_venues_meta.id = venue_photo_instagram.foursquare_venue_id "
           "and venue_photo_instagram.comments is not NULL")
    cursor = connect_to_mysql()
    cursor.execute(sql)
    comments = {}
    counter = {}
    for r in cursor.fetchall():
        # Cap each venue at 300 photo rows so large venues don't dominate a category.
        if r['id'] in counter:
            counter[r['id']] += 1
        else:
            counter[r['id']] = 1
        if counter[r['id']] > 300:
            continue
        cmts = json.loads(r['comment'])
        cate = r['cate']
        # Initialize the category bucket once, then always append this row's comments.
        if cate not in comments:
            comments[cate] = ""
        for cmt in cmts:
            comments[cate] += cmt[1]
    print 'doc construction done'
    cate_names = []
    docs = []
    for k in comments.keys():
        cate_names.append(k)
        docs.append(comments[k].encode("utf-8", 'ignore'))
    tf_idf = TFIDF()
    print 'begin compute'
    words_list = tf_idf.compute_tfidf(docs, top_n_words=500)
    for i in range(len(cate_names)):
        print cate_names[i].encode('utf-8', 'ignore')
        print words_list[i]
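# The init-then-append accumulation above maps naturally onto
# collections.defaultdict; a minimal sketch of the same step as a helper
# (build_category_docs is a hypothetical name, not in the original code):
import json
from collections import defaultdict

def build_category_docs(rows):
    docs = defaultdict(unicode)  # category name -> concatenated comment text
    for r in rows:
        for cmt in json.loads(r['comment']):
            docs[r['cate']] += cmt[1]
    return docs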
def downloader():
    sql = ("select venue_photo_4sq.url as url, venue_meta.name as name "
           "from venue_photo_4sq, venue_meta "
           "where venue_photo_4sq.venue_id = venue_meta.id")
    cursor = connect_to_mysql()
    cursor.execute(sql)
    rows = cursor.fetchall()
    # Group photo URLs by venue name.
    files_dic = {}
    for r in rows:
        if r['name'] not in files_dic:
            files_dic[r['name']] = []
        files_dic[r['name']].append(r['url'])
    for k in files_dic.keys():
        out_dir = '/Users/eddiexie/work/quan_arch/pics/' + k
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        pic_count = 0
        for _url in files_dic[k]:
            try:
                pic_count += 1
                print 'working on %s\t %d' % (_url, pic_count)
                out_path = out_dir + '/' + str(pic_count) + '.jpg'
                urlretrieve(_url, out_path)
            except:
                continue
def do_work(venue_id):
    sql = ("select standard_resolution as url, plazas_meta.name as name "
           "from plazas_instaphoto, plazas_meta "
           "where plazas_instaphoto.foursquare_venue_id = plazas_meta.id "
           "and plazas_instaphoto.foursquare_venue_id = '" + venue_id + "'")
    cursor = connect_to_mysql()
    cursor.execute(sql)
    rows = cursor.fetchall()
    k = rows[0]['name']
    download_path = '/Users/eddiexie/work/plaza_insta_pics/'
    if not os.path.exists(download_path + k):
        os.mkdir(download_path + k)
    pic_count = 0
    for r in rows:
        # The stored value may have junk before the URL; keep from 'http' onward.
        _url = r['url'][r['url'].find('http'):].strip()
        try:
            pic_count += 1
            if pic_count >= 500:  # cap downloads per venue
                break
            print 'working on %s %s\t %d' % (k, _url, pic_count)
            out_path = download_path + k + '/' + str(pic_count) + '.jpg'
            urlretrieve(_url, out_path)
        except:
            continue
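# These scripts splice venue ids straight into SQL strings; a DB-API cursor can
# instead bind the id as a parameter so the driver handles quoting and
# escaping. A minimal sketch of the lookup above (fetch_photo_rows is a
# hypothetical helper, not in the original code):
def fetch_photo_rows(cursor, venue_id):
    cursor.execute(
        "select standard_resolution as url, plazas_meta.name as name "
        "from plazas_instaphoto, plazas_meta "
        "where plazas_instaphoto.foursquare_venue_id = plazas_meta.id "
        "and plazas_instaphoto.foursquare_venue_id = %s", (venue_id,))
    return cursor.fetchall()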
from lib.mysql_connect import connect_to_mysql

lines = open('plazas.csv').readlines()
cursor = connect_to_mysql()
for line in lines[1:]:  # skip the header row
    venue_id = line.strip()
    cursor.execute("REPLACE INTO plazas (id) values (%s)", (venue_id,))