예제 #1
0
파일: job.py 프로젝트: oeddyo/DataBagel
    def submit(self, downloading_locker):
        paras = []
        for id in self.venue_ids:
            paras.append( (id, self.job_id) )
        do_multithread_job(instagram_do_work, paras, 10, './log/log_'+str(self.job_id)+'insta.txt')
        do_multithread_job(foursquare_do_work,paras, 10,'./log/log_'+str(self.job_id)+'_meta.txt')
        do_multithread_job(foursquare_tips_do_work, paras, 10, './log/log_'+str(self.job_id)+'_tips.txt')
        do_multithread_job(foursquare_photos_do_work, paras, 10, './log/log_'+str(self.job_id)+'_photos.txt')

        self.status = 'Finished'
        downloading_locker.clear() 
        #should export the csv file here
        meta_file_name = "metadata_user_"+str(self.job_id)+".csv"
        insta_file_name = "instagram_user_"+str(self.job_id)+".csv"
        tips_file_name = "tips_user_"+str(self.job_id)+".csv"
        foursquare_photo_file_name = "foursquare_photo_user_"+str(self.job_id)+".csv"
        sql = "SELECT * from venue_meta"+str(self.job_id)+"""
        INTO OUTFILE '/db/www/places-crawl/data/"""+meta_file_name+"""'
        FIELDS TERMINATED BY ','ENCLOSED BY '"'LINES TERMINATED BY '\n'
        """
        cursor = connect_to_mysql()
        try:
            cursor.execute(sql)
        except Exception as e:
            print 'OUTPUT TO DB FOLDER ERROR'

        sql = "SELECT * from venue_photo_instagram"+str(self.job_id)+"""
        INTO OUTFILE '/db/www/places-crawl/data/"""+insta_file_name+"""'
        FIELDS TERMINATED BY ','ENCLOSED BY '"'LINES TERMINATED BY '\n'
        """
        try:
            cursor = connect_to_mysql()
            cursor.execute(sql)
        except Exception as e:
            print "OUTPUT TO INSTAGRAM FOLDER ERROR"
        sql = "SELECT * from venue_tips"+str(self.job_id)+"""
        INTO OUTFILE '/db/www/places-crawl/data/"""+tips_file_name+"""'
        FIELDS TERMINATED BY ','ENCLOSED BY '"'LINES TERMINATED BY '\n'
        """
        try:
            cursor = connect_to_mysql()
            cursor.execute(sql)
        except Exception as e:
            print "OUTPUT TO FOURSQUARE_TIPS FOLDER ERROR"
        sql = "SELECT * from venue_photo_4sq"+str(self.job_id)+"""
        INTO OUTFILE '/db/www/places-crawl/data/"""+foursquare_photo_file_name+"""'
        FIELDS TERMINATED BY ','ENCLOSED BY '"'LINES TERMINATED BY '\n'
        """
        try:
            cursor = connect_to_mysql()
            cursor.execute(sql)
        except Exception as e:
            print "OUTPUT TO FOURSQUARE_PHOTO FOLDER ERROR"
        return "job done"
예제 #2
0
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    docs = []
    names = []
    comments_count = []
    for id in foursquare_ids:
        sql = """select caption from plazas_instaphoto where caption is not NULL and foursquare_venue_id='""" + id + "'"
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}

        cursor.execute(sql)
        res = cursor.fetchall()
        doc = ""
        for r in res:
            comments = r['caption']
            #comments = json.loads(r['caption'])
            for sentence in comments:
                #doc += sentence[1]
                doc += sentence
        docs.append(doc)
        names.append(venue_name)
        comments_count.append(len(res))
    t = TFIDF()
    res = t.compute_tfidf(docs)
    for i in range(len(res)):
        print 'Plaza name : ' + names[i].encode('utf-8', 'ignore')
        print 'Comments for this Plaza : ' + str(comments_count[i])
        print res[i]
예제 #3
0
def export_temporal_data(time_window, start_time, end_time):
    fetched_results = get_all_stats()
    ids = get_all_foursquare_ids()
    venue_dic = {}
    for r in ids.keys():
        venue_dic[r] = []

    cur_time = start_time
    """round time to nearest hour/day/week"""
    if time_window == 'hour':
        time_window = timedelta(hours=2)
        cur_time = cur_time - timedelta(
            minutes=cur_time.minute,
            seconds=cur_time.second,
            microseconds=cur_time.microsecond) + timedelta(hours=1)
    elif time_window == 'day':
        time_window = timedelta(days=1)
        cur_time = cur_time - timedelta(
            hours=cur_time.hour,
            minutes=cur_time.minute,
            seconds=cur_time.second,
            microseconds=cur_time.microsecond) + timedelta(days=1)
    elif time_window == 'week':
        time_window = timedelta(weeks=1)
        cur_time = cur_time - timedelta(
            hours=cur_time.hour,
            minutes=cur_time.minute,
            seconds=cur_time.second,
            microseconds=cur_time.microsecond) + timedelta(weeks=1)

    data_type = 'checkinsCount'
    time_list = []
    time_list.append('Plaza/Time')
    while cur_time > start_time and cur_time <= end_time:
        print cur_time
        time_list.append(cur_time.hour)
        for id in venue_dic.keys():
            sql = "select * from venue_stats where id =  '" + id + "' and time between '" + str(
                cur_time) + "' and '" + str(cur_time + time_window) + "'"
            cursor = connect_to_mysql()
            cursor.execute(sql)
            res = cursor.fetchall()
            if len(res) != 0:
                tmp = [val[data_type] for val in res]
                value = max(tmp) - min(tmp)
            else:
                value = 0
            venue_dic[id].append(value)
        cur_time += time_window
    f = open('./temporal_2hour.csv', 'wt')
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(time_list)

    for key in venue_dic.keys():
        venue_dic[key].insert(0, ids[key])
        writer.writerow(venue_dic[key])
    print data_type
    print time_list
    for id in ids.keys():
        print ids[id], venue_dic[id]
예제 #4
0
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    docs = [] 
    names = []
    comments_count = []
    for id in foursquare_ids:
        sql = """select caption from plazas_instaphoto where caption is not NULL and foursquare_venue_id='"""+id + "'"
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}
        
        cursor.execute(sql)
        res = cursor.fetchall()
        doc = ""
        for r in res:
            comments = r['caption']
            #comments = json.loads(r['caption'])
            for sentence in comments:
                #doc += sentence[1]
                doc += sentence
        docs.append(doc)
        names.append( venue_name )
        comments_count.append( len(res) )
    t = TFIDF()
    res =  t.compute_tfidf(docs)
    for i in range(len(res) ):
        print 'Plaza name : '+names[i].encode('utf-8','ignore')
        print 'Comments for this Plaza : ' + str(comments_count[i])
        print res[i]
예제 #5
0
def main():
    sql = """select distinct(foursquare_venue_id) as id from plazas_instaphoto"""
    cursor = connect_to_mysql()
    cursor.execute(sql)
    ids = []
    for r in cursor.fetchall():
        ids.append(r['id'])
    do_multithread_job(do_work, ids, 20, './log/dump_pics.log')
def main():

    already_in = set()
    sql = """ select distinct(foursquare_venue_id) as id from plazas_instaphoto """
    cursor = connect_to_mysql()
    cursor.execute(sql)
    for r in cursor.fetchall():
        already_in.add( r['id'] )

    sql = "select id from plazas_nyc"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    
    ids = []
    for r in cursor.fetchall():
        #if r['id'] not in already_in:
        ids.append(r['id'])
    do_multithread_job(do_work, ids, 5, './log/download_instagram.log')
    print ids
예제 #7
0
def main():

    already_in = set()
    sql = """ select distinct(foursquare_venue_id) as id from plazas_instaphoto """
    cursor = connect_to_mysql()
    cursor.execute(sql)
    for r in cursor.fetchall():
        already_in.add(r['id'])

    sql = "select id from plazas_nyc"
    cursor = connect_to_mysql()
    cursor.execute(sql)

    ids = []
    for r in cursor.fetchall():
        #if r['id'] not in already_in:
        ids.append(r['id'])
    do_multithread_job(do_work, ids, 5, './log/download_instagram.log')
    print ids
예제 #8
0
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = (
            """select comments from venue_photo_instagram where comments is not NULL and foursquare_venue_id='"""
            + id
            + "'"
        )
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}

        cursor.execute(sql)
        res = cursor.fetchall()
        # comments = []
        for r in res:
            comments = json.loads(r["comments"])
            for sentence in comments:
                words = tokenize(sentence[1])
                for word in words:
                    if word in global_word_freq:
                        global_word_freq[word] += 1
                    else:
                        global_word_freq[word] = 1

                    if word in venues[venue_name]:
                        venues[venue_name][word] += 1
                    else:
                        venues[venue_name][word] = 1
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
        # comments.append( json.loads(r['comments']) )
        # print venues[venue_name]
    too_common_word = []
    for word in sorted(global_word_freq.items(), key=lambda tup: tup[1], reverse=True)[:300]:
        too_common_word.append(word[0])
    print too_common_word
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word] / (1 + math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append((word, score, words[word]))

        print venue_name
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:20]
예제 #9
0
def main():
    sql = "select id from plazas"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    
    ids = []
    count = 10
    for r in cursor.fetchall():
        ids.append(r['id'])
        count-=1;
        if(count<=0):break
    do_multithread_job(do_work, ids, 10, 'do_multi.log')
예제 #10
0
def export_temporal_data(time_window, start_time, end_time):
    fetched_results = get_all_stats()
    ids = get_all_foursquare_ids()
    venue_dic = {}
    for r in ids.keys():
        venue_dic[r] = [] 
    
    cur_time = start_time
    """round time to nearest hour/day/week"""
    if time_window == 'hour':
        time_window = timedelta(hours=2)
        cur_time = cur_time - timedelta(minutes=cur_time.minute, seconds=cur_time.second, microseconds = cur_time.microsecond) + timedelta(hours=1)
    elif time_window == 'day':
        time_window = timedelta(days=1)
        cur_time = cur_time - timedelta(hours = cur_time.hour, minutes=cur_time.minute, seconds=cur_time.second, microseconds = cur_time.microsecond) + timedelta(days=1)
    elif time_window == 'week':
        time_window = timedelta(weeks=1)
        cur_time = cur_time - timedelta(hours = cur_time.hour, minutes=cur_time.minute, seconds=cur_time.second, microseconds = cur_time.microsecond)+timedelta(weeks=1)
     
    data_type = 'checkinsCount'
    time_list = []
    time_list.append('Plaza/Time')
    while cur_time > start_time and cur_time <= end_time:
        print cur_time
        time_list.append(cur_time.hour)
        for id in venue_dic.keys():
            sql = "select * from venue_stats where id =  '" + id + "' and time between '" + str(cur_time) + "' and '" + str(cur_time+time_window) +"'"
            cursor = connect_to_mysql()
            cursor.execute(sql)
            res = cursor.fetchall()
            if len(res) != 0:
                tmp = [val[data_type] for val in res ]
                value = max(tmp) - min(tmp) 
            else:
                value = 0
            venue_dic[id].append( value )
        cur_time+= time_window
    f = open('./temporal_2hour.csv', 'wt')
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow( time_list)
    
    for key in venue_dic.keys():
        venue_dic[key].insert(0, ids[key])
        writer.writerow( venue_dic[key] )
    print data_type
    print time_list
    for id in ids.keys():
        print ids[id], venue_dic[id]
예제 #11
0
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = """select comments from plazas_instaphoto where comments is not NULL and foursquare_venue_id='""" + id + "'"
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}

        cursor.execute(sql)
        res = cursor.fetchall()
        for r in res:
            comments = json.loads(r['comments'])
            for sentence in comments:
                words = tokenize(sentence[1])
                for word in words:
                    if word in global_word_freq:
                        global_word_freq[word] += 1
                    else:
                        global_word_freq[word] = 1

                    if word in venues[venue_name]:
                        venues[venue_name][word] += 1
                    else:
                        venues[venue_name][word] = 1
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    too_common_word = set()
    for word in sorted(global_word_freq.items(),
                       key=lambda tup: tup[1],
                       reverse=True)[:300]:
        too_common_word.add(word[0])
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word] / (1 + math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append((word, score, words[word]))

        print venue_name
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:20]
def main():
    foursquare_ids = get_all_foursquare_ids()
    print foursquare_ids
    venues = {}
    cursor = connect_to_mysql()
    word_doc_freq = {}
    global_word_freq = {}
    for id in foursquare_ids:
        sql = """select caption from plazas_instaphoto where caption is not NULL and foursquare_venue_id='"""+id + "'"
        venue_name = foursquare_ids[id]
        venues[venue_name] = {}

        cursor.execute(sql)
        res = cursor.fetchall()
        for r in res:
            comments = [r['caption']]
            for sentence in comments:
                words = tokenize(sentence)
                for word in words:
                    if word in global_word_freq:
                        global_word_freq[word]+=1
                    else:
                        global_word_freq[word] = 1

                    if word in venues[venue_name]:
                        venues[venue_name][word] += 1
                    else:
                        venues[venue_name][word] = 1
        for w in venues[venue_name].keys():
            if w in word_doc_freq:
                word_doc_freq[w]+=1
            else:
                word_doc_freq[w] = 1
    too_common_word = set()
    for word in sorted(global_word_freq.items(), key = lambda tup: tup[1], reverse=True)[:300]:
        too_common_word.add(word[0])
    for venue_name in venues.keys():
        word_score = []
        words = venues[venue_name]
        for word in words:
            score = words[word]/(1+math.log(word_doc_freq[word]))
            if word not in too_common_word:
                word_score.append( (word, score, words[word]) )
        
        print venue_name.encode('utf-8','ignore')
        print sorted(word_score, key=lambda tup: tup[1], reverse=True)[0:50]
def main():
    sql = "select * from manhattan_venues_meta where checkinsCount > 1000 and tipCount > 20"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    all_venues = cursor.fetchall()

    ids = []
    for r in all_venues:
        ids.append( r['id'] )
    
    ids = set(ids)

    crawler = VenuePhotoCrawlerInstagram()
    
    cnt = 0 
    for id in ids:
        cnt += 1
        logging.debug('crawling %d'%(cnt))
        if random.random()<0.35:
            crawler.grab_photos(id, 10)
예제 #14
0
def main():
    sql = "select * from manhattan_venues_meta where checkinsCount > 1000 and tipCount > 20"
    cursor = connect_to_mysql()
    cursor.execute(sql)
    all_venues = cursor.fetchall()

    ids = []
    for r in all_venues:
        ids.append(r['id'])

    ids = set(ids)

    crawler = VenuePhotoCrawlerInstagram()

    cnt = 0
    for id in ids:
        cnt += 1
        logging.debug('crawling %d' % (cnt))
        if random.random() < 0.35:
            crawler.grab_photos(id, 10)
예제 #15
0
def main():
    sql = "select manhattan_venues_meta.id as id,  venue_photo_instagram.comments as comment, categoriesName as cate from manhattan_venues_meta, venue_photo_instagram where manhattan_venues_meta.id = venue_photo_instagram.foursquare_venue_id and venue_photo_instagram.comments is not NULL  "

    cursor = connect_to_mysql()
    cursor.execute(sql)
    comments = {}
    counter = {}
    for r in cursor.fetchall():
        if r['id'] in counter:
            counter[r['id']] += 1
        else:
            counter[r['id']] = 1
        if counter[r['id']] > 300:
            continue
        cmts = json.loads(r['comment'])
        cate = r['cate']
        if cate not in comments:
            comments[cate] = ""
        else:
            for cmt in cmts:
                comments[cate] += cmt[1]
    #print comments
    print 'doc construction done'
    cate_names = []
    docs = []
    for k in comments.keys():
        cate_names.append(k)
        docs.append(comments[k].encode("utf-8", 'ignore'))
    tf_idf = TFIDF()
    print 'begin compute'
    words_list = tf_idf.compute_tfidf(docs, top_n_words=500)
    for i in range(len(cate_names)):
        #cate_names[i][1] = cate_names[i][1].encode("utf-8", 'ignore')
        #cate_names[i] = (cate_names[i][0], cate_names[i][1].encode('utf-8', 'ignore'), cate_names[i][2])

        print cate_names[i].encode('utf-8', 'ignore')
        print words_list[i]
def main():
    sql = "select manhattan_venues_meta.id as id,  venue_photo_instagram.comments as comment, categoriesName as cate from manhattan_venues_meta, venue_photo_instagram where manhattan_venues_meta.id = venue_photo_instagram.foursquare_venue_id and venue_photo_instagram.comments is not NULL  "
    
    cursor = connect_to_mysql()
    cursor.execute(sql)
    comments = {}
    counter = {}
    for r in cursor.fetchall():
        if r['id'] in counter:
            counter[r['id']]+=1
        else:
            counter[r['id']] = 1
        if counter[r['id']]>300:
            continue
        cmts =  json.loads(r['comment'])
        cate = r['cate']
        if cate not in comments:
            comments[cate] = ""
        else:
            for cmt in cmts:
                comments[cate]+=cmt[1]
    #print comments
    print 'doc construction done'
    cate_names = []
    docs = []
    for k in comments.keys():
        cate_names.append(k)
        docs.append(comments[k].encode("utf-8",'ignore'))
    tf_idf = TFIDF()
    print 'begin compute'
    words_list = tf_idf.compute_tfidf(docs, top_n_words=500)
    for i in range(len(cate_names)):
        #cate_names[i][1] = cate_names[i][1].encode("utf-8", 'ignore')
        #cate_names[i] = (cate_names[i][0], cate_names[i][1].encode('utf-8', 'ignore'), cate_names[i][2])

        print cate_names[i].encode('utf-8', 'ignore')
        print words_list[i]
예제 #17
0
def downloader():
    sql = "select venue_photo_4sq.url as url, venue_meta.name as name from venue_photo_4sq, venue_meta where venue_photo_4sq.venue_id = venue_meta.id"
    cursor =  connect_to_mysql()
    cursor.execute(sql)
    rows = cursor.fetchall()
    my_cnt = 0
    files_dic = {}
    for r in rows:
        if r['name'] not in files_dic:
            files_dic[r['name']] = []
        else:
            files_dic[r['name']].append( r['url'] )
    
    for k in files_dic.keys():
        os.mkdir('/Users/eddiexie/work/quan_arch/pics/'+k)
        pic_count = 0
        for _url in files_dic[k]:
            try:
                pic_count+=1
                print 'working on %s\t %d'%(_url, pic_count)
                out_path = '/Users/eddiexie/work/quan_arch/pics/'+k+'/'+str(pic_count)+'.jpg'
                urlretrieve(_url, out_path)
            except:
                continue
예제 #18
0
def do_work(venue_id):
    sql = "select standard_resolution as url, plazas_meta.name as name from plazas_instaphoto, plazas_meta where plazas_instaphoto.foursquare_venue_id = plazas_meta.id and plazas_instaphoto.foursquare_venue_id = '"+venue_id+"'"
    cursor =  connect_to_mysql()
    cursor.execute(sql)
    rows = cursor.fetchall()
    my_cnt = 0
    
    #for k in files_dic.keys():
    k = rows[0]['name']
    download_path = '/Users/eddiexie/work/plaza_insta_pics/'
    if not os.path.exists(download_path+k):
        os.mkdir(download_path+k)
    pic_count = 0
    for r in rows:
        _url = r['url'][r['url'].find('http'):].strip()
        try:
            pic_count+=1
            if(pic_count>=500):
                break
            print 'working on %s %s\t %d'%(k,_url, pic_count)
            out_path = download_path+k+'/'+str(pic_count)+'.jpg'
            urlretrieve(_url, out_path)
        except:
            continue
from lib.mysql_connect import connect_to_mysql

lines =  open('plazas.csv').readlines()
cursor = connect_to_mysql()
for line in lines[1:]:
    venue_id = line.strip()
    cursor.execute("REPLACE INTO plazas (id) values (%s) ",(venue_id) )


예제 #20
0
from lib.mysql_connect import connect_to_mysql

lines = open('plazas.csv').readlines()
cursor = connect_to_mysql()
for line in lines[1:]:
    venue_id = line.strip()
    cursor.execute("REPLACE INTO plazas (id) values (%s) ", (venue_id))