示例#1
0
    def doClusteringOnUser(self):
        photos = self.getClusteringData()
        all_text = []
        ei = MongoDBInterface()
        ei.setDB('citybeat_production')
        ei.setCollection('photos')
        user_cnt = 0
        for p in photos:
            user_cnt+=1
            if user_cnt%10==0:
                print 'user ', user_cnt
            user_name = p['user']['username']
            user_photos = ei.getAllDocuments( {'user.username':user_name})
            text = ""
            for tp in user_photos:
                try:
                    text += tp['caption']['text']
                except:
                    continue
            all_text.append( text )
        vectorizer = TfidfVectorizer(max_df = 0.1, lowercase = True, sublinear_tf=True, min_df=10, stop_words='english', use_idf=True)
        X = vectorizer.fit_transform(all_text)

        print 'shape = ',X.shape 
        
        algo = KMeans(10) 
        #algo = SpectralClustering(n_clusters=5)
        X = normalize(X)
        algo.fit(X)
        
        f = file(self.file_name_prefix+'text_on_user.csv', 'w')
        
        for idx in range(len(photos)):
            p = photos[idx]
            f.write( (str(p['location']['latitude'])+','+str(p['location']['longitude'])+','+str(algo.labels_[idx])+','+p['images']['standard_resolution']['url'] + '\n' ))
示例#2
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from mongodb_interface import MongoDBInterface
from photo import Photo

import random


if __name__ == '__main__':
	mi = MongoDBInterface()
	mi.setDB('test_caption')
	mi.setCollection('captions')
	
	cp = CaptionParser(True)
	
	i = 0
	captions = mi.getAllDocuments()
	for caption in captions:
		i += 1
		if i % 1000 == 0:
#			print cp.getTopWords(200)
			print i
			print len(cp._)
		cp.insertCaption(caption['caption'])

	for word, value in cp.getTopWords(300):
		print '\''+word+'\',',
	print
示例#3
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random


if __name__ == '__main__':
	# Source: documents of the citybeat.photos collection.
	photo_source = PhotoInterface()
	photo_source.setDB('citybeat')
	photo_source.setCollection('photos')

	# Destination: the test_caption.captions collection.
	caption_sink = MongoDBInterface()
	caption_sink.setDB('test_caption')
	caption_sink.setCollection('captions')

	for raw_photo in photo_source.getAllDocuments():
		# Random sampling: randint(0, 10) is inclusive on both ends, so a photo
		# is kept only when the draw is 0 (1 chance in 11).
		if random.randint(0, 10) > 0:
			continue
		caption = Photo(raw_photo).getCaption()
		# Photos with an empty caption are not stored.
		if len(caption) == 0:
			continue
		caption_sink.saveDocument({'caption': caption})
示例#4
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from mongodb_interface import MongoDBInterface
from photo import Photo

import random

if __name__ == '__main__':
    mi = MongoDBInterface()
    mi.setDB('test_caption')
    mi.setCollection('captions')

    cp = CaptionParser(True)

    i = 0
    captions = mi.getAllDocuments()
    for caption in captions:
        i += 1
        if i % 1000 == 0:
            #			print cp.getTopWords(200)
            print i
            print len(cp._)
        cp.insertCaption(caption['caption'])

    for word, value in cp.getTopWords(300):
        print '\'' + word + '\',',
    print
示例#5
0
# Connect to the source MongoDB server.
# NOTE(review): source_mongodb_address is not assigned in the visible lines --
# presumably it is set just above this fragment; confirm.
source_mongodb_port = 27017
source_connection = pymongo.Connection(source_mongodb_address, source_mongodb_port)
# Authenticate against the source server's 'admin' database.
# NOTE(review): the username and password are hard-coded in the source file;
# consider loading them from configuration or the environment instead.
source_connection['admin'].authenticate( 'admin', 'mediumdatarules')

# Connect to the target MongoDB server. No authenticate() call is made on this
# connection in the visible lines.
target_mongodb_address = 'grande.rutgers.edu'
target_mongodb_port = 27017
target_connection = pymongo.Connection(target_mongodb_address, target_mongodb_port)

# Print the source's database names, then the collection names of
# 'citybeat_production' on the source and on the target.
print source_connection.database_names()
print source_connection['citybeat_production'].collection_names()
print target_connection['citybeat_production'].collection_names()


for collection in source_connection['citybeat_production'].collection_names():
    print 'start collection: ' + collection
    source_interface = MongoDBInterface()
    source_interface._connection = source_connection
    source_interface.setDB('citybeat_production')
    source_interface.setCollection(collection)

    target_interface = MongoDBInterface()
    target_interface._connection = target_connection
    target_interface.setDB('citybeat_production')
    target_interface.setCollection(collection)

    count = 0
    for e in source_interface.getAllDocuments():
        try:
            target_interface.saveDocument(e)
            count += 1
        except Exception:
示例#6
0
def main():
    ec2 = MongoDBInterface()
    ec2.setDB('test_chaolun')
    ec2.setCollection('test')
    print ec2.getAllDocumentIDs()
示例#7
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Reader for the citybeat.photos collection.
    photos_db = PhotoInterface()
    photos_db.setDB('citybeat')
    photos_db.setCollection('photos')

    # Writer for the test_caption.captions collection.
    captions_db = MongoDBInterface()
    captions_db.setDB('test_caption')
    captions_db.setCollection('captions')

    for document in photos_db.getAllDocuments():
        # Sample the photos: randint(0, 10) includes both endpoints, so only a
        # draw of 0 (1 in 11) lets a photo through.
        draw = random.randint(0, 10)
        if draw > 0:
            continue
        caption_text = Photo(document).getCaption()
        # Store the caption only when it is non-empty.
        if len(caption_text) > 0:
            captions_db.saveDocument({'caption': caption_text})