Пример #1
0
	def __init__(self, host='localhost', port=27017):
		"""Open a MongoDB client and pick the database for the current environment.

		host and port are kept on the instance and passed unchanged to
		MongoClient. The client is also handed to a YoutubeValidator.

		current_db is assigned only when the global current_env (defined
		outside this block) equals 'dev' or 'prod'; for any other value the
		attribute is left unset.
		"""
		self.host, self.port = host, port
		self.client = MongoClient(host, port)
		self.yt_validator = YoutubeValidator(self.client)

		# Environment name -> name of the database attribute on the client
		for env_name, db_name in (('dev', 'revolvr_development'), ('prod', 'revolvr')):
			if current_env == env_name:
				self.current_db = getattr(self.client, db_name)
				break
Пример #2
0
class MongoConnector:
	
	#Store the client, the YouTube validator and the current database
	def __init__(self, host='localhost', port=27017):
		"""Open a MongoDB client and pick the database for the current environment.

		host and port are kept on the instance and passed unchanged to
		MongoClient. The client is also handed to a YoutubeValidator.

		current_db is assigned only when the global current_env (defined
		outside this class) equals 'dev' or 'prod'; for any other value the
		attribute is left unset.
		"""
		self.host, self.port = host, port
		self.client = MongoClient(host, port)
		self.yt_validator = YoutubeValidator(self.client)

		# Environment name -> name of the database attribute on the client
		for env_name, db_name in (('dev', 'revolvr_development'), ('prod', 'revolvr')):
			if current_env == env_name:
				self.current_db = getattr(self.client, db_name)
				break

	def insert_unassigned_media(self, all_media):
		for source in all_media:
			for item in source[1]:
				if 'youtube' in item['link']:
					if self.yt_validator.check_media(item['link']):
						self.process_media_item(item)
				else:
					self.process_media_item(item)

	def insert_media(self, username, all_media=None):

		if all_media is not None:
			for source in all_media:
				#For each media source, grab the media items
				for item in source[1]:
					if 'youtube' in item['link']:
						if self.yt_validator.check_media(item['link']):
							media_id = self.process_media_item(item)
							self.process_user_media_item(media_id, username)
							self.process_tag_item(media_id, username)
					else:
							media_id = self.process_media_item(item)
							self.process_user_media_item(media_id, username)
							self.process_tag_item(media_id, username)

	def process_media_item(self, item):

		media_collection = self.current_db.media

		#Check whether we already have the item
		media_id = self._get_media_id(item)

		#If we have no record of this, insert it
		if media_id is None:
			media_id = media_collection.insert(item)

		return media_id

	def process_user_media_item(self, media_id, username):
		"""Record that username is associated with the media item media_id.

		The user_media collection holds one document per media item, keyed by
		the media id, with a 'user_ratings' list of {'user', 'rating'} entries.

		- No document yet: one is inserted with a single rating for username.
		- Document exists and username is already rated: nothing is written.
		- Document exists without username: a rating is appended and the
		  whole list is written back.

		New ratings are a random integer from randint(1, 100).
		"""
		user_media_collection = self.current_db.user_media

		# Use the same ObjectId for lookup, insert and update. The original
		# inserted the raw media_id, which the ObjectId lookup could never
		# find again when a string id was passed in.
		media_oid = ObjectId(media_id)

		user_media_item = user_media_collection.find_one({'_id': media_oid})

		#If this media has not previously been associated with a user, store it
		if user_media_item is None:
			user_media_collection.insert({u'_id': media_oid, 'user_ratings': [{'user': username, 'rating': randint(1, 100)}]})
			return

		user_ratings = user_media_item[u'user_ratings']

		# any() is well-defined for an empty list; the original flag was only
		# assigned inside the loop and raised UnboundLocalError in that case.
		if any(user_rating[u'user'] == username for user_rating in user_ratings):
			return

		user_ratings.append({'user': username, 'rating': randint(1, 100)})
		user_media_collection.update({u'_id': media_oid}, {"$set": {'user_ratings': user_ratings}}, upsert=True)


	def process_tag_item(self, media_id, username):
		"""Record tag phrases for username from the stored media item's name.

		The media document is loaded by id. If it has a non-empty 'name', the
		name is cleaned with _strip_tags and recorded as a whole phrase; when
		the cleaned title has more than one word, the individual words are
		recorded as terms as well.
		"""
		media_item = self.current_db.media.find_one({'_id': ObjectId(media_id)})

		name = media_item.get(u'name')
		if not name:
			# Missing or empty name: nothing to tag
			return

		title = self._strip_tags(name)
		self._check_and_insert_title(username, title)

		words = title.split()
		if len(words) > 1:
			self._check_and_insert_terms(username, words)

	def _check_and_insert_title(self, username, title):

		# For each title, check if the whole thing exists.
		tag_collection = self.current_db.tags

		tag_item = tag_collection.find_one({'phrase': title })

		# If it doesn't, store the user_id alongside it
		if tag_item is None:
			tag_collection.insert({'phrase' : title, 'users' : [{'user' : username, 'count' : 1}]})
			return True

		else:

			#Else we've seen this phrase before (but not necessarily from this user)
			users = tag_item[u'users']
			existing_user = False

			for user in users:
				if user[u'user'] == username:
					existing_user = True
					break

			#If this user hasn't shown interest in this phrase before
			if not existing_user:
				users.append({'user' : username, 'count' : 1})
				tag_collection.update({u'_id' : tag_item[u'_id']}, {"$set": {'users' : users}}, upsert=True)
			else:
				return False

	def _check_and_insert_terms(self, username, terms):

		for term in terms:
			# For each term, check if the whole thing exists.
			tag_collection = self.current_db.tags

			tag_item = tag_collection.find_one({'phrase': term })

			# Term doesn't exist, so throw it in
			if tag_item is None:
				associations = self._get_associations_for_tag(term, terms, None)
				tag_collection.insert({'phrase' : term, 'users' : [{'user' : username, 'count' : 1}], 'associations' : associations })
			else:

				users = self._get_users_for_tag(term, tag_item[u'users'], username)
				associations = self._get_associations_for_tag(term, terms, tag_item)

				tag_collection.update({u'_id' : tag_item[u'_id']}, {"$set": {'users' : users}}, upsert=True)
				tag_collection.update({u'_id' : tag_item[u'_id']}, {"$set": {'associations' : associations}}, upsert=True)


	def _strip_tags(self, title):
		"""Return title reduced to its meaningful words, joined by single spaces.

		A word is dropped when, after removing punctuation, it is an English
		stopword or is at most one character long. The remaining words are
		normalised with _strip_word and dropped if the result is at most one
		character long or appears in the 'media' file of the custom corpus
		under '../custom_corpora/'.

		Returns an empty string when no word survives.
		"""
		words = title.split()
		if not words:
			return ''

		# Load the word lists once per call, not once per word, and use sets
		# for constant-time membership tests.
		stop_words = set(stopwords.words('english'))
		media_words = None  # read from disk only if some word gets that far

		kept = []
		for word in words:

			#Remove all punctuation
			no_punc = ''.join(c for c in word if c not in string.punctuation)

			#Skip stopwords and single letters
			if len(no_punc) <= 1 or no_punc.lower() in stop_words:
				continue

			stripped_word = self._strip_word(word)
			if len(stripped_word) <= 1:
				continue

			if media_words is None:
				custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')
				media_words = set(custom_corpus.words('media'))

			if stripped_word not in media_words:
				kept.append(stripped_word)

		return ' '.join(kept)

	def _strip_word(self, word):

		word = word.lower()
		word = ''.join(c for c in word if not c.isdigit())
		word = ''.join(c for c in word if c not in (string.punctuation[:6] + string.punctuation[7:]))

		return word

	def _get_users_for_tag(self, term, users, username):
		#Else we've seen this phrase before (but not necessarily from this user)
		existing_user = False

		for user in users:
			if user[u'user'] == username:
				existing_user = True
				break

		#If this user hasn't shown interest in this phrase before
		if not existing_user:
			users.append({'user' : username, 'count' : 1})
		else:
			for user in users:
				if user[u'user'] == username:
					user[u'count'] = int(user[u'count']) + 1
					break

		return users


	###############
	#
	#	term is the phrase whose association list we're modifying
	#	terms is the list of all terms in the current title
	#	tag_item is the stored tag document for term (None if there is none);
	#	its 'associations' list, when present, holds the previous association values
	#
	###################
	def _get_associations_for_tag(self, term, terms, tag_item):

		if tag_item is None:
			prev_associations = None
		elif u'associations' in tag_item.keys():
			prev_associations = tag_item[u'associations']
		else:
			prev_associations = None

		associations = []

		if prev_associations is None:

			for word in terms:
				if term == word:
					continue
				associations.append({ 'phrase' : word, 'count' : 1})
		
			return associations

		else:
			for word in terms:
				#If we're looking at the term we're adding to, skip it
				if term == word:
					continue

				new_word = True
				for comparator in prev_associations:
					if comparator[u'phrase'] == word:
						new_word = False
						break

				if new_word:
					associations.append({'phrase' : word, 'count' : 1})
				else:
					for comparator in prev_associations:
						if comparator[u'phrase'] == word:
							associations.append({'phrase' : word, 'count' : int(comparator[u'count']) + 1})
							break

		return associations

	def _get_media_id(self, item):
		media_collection = self.current_db.media
		media_id = None

		for media in media_collection.find():
			if 'link' in item.keys():
				if item[u'link'] == media[u'link']:
					media_id = media[u'_id']
					break
			elif 'name' in item.keys():
				if item[u'name'] == media[u'name']:
					media_id = media[u'_id']
					break

		return media_id

	def media_exists(self, paged_media, user):
		"""Report whether user is already recorded against the paged media.

		Returns False when paged_media is None. Otherwise the items are
		checked in order:

		- an item whose 'name' has no media document ends the check with False;
		- an item whose media document has no user_media document ends the
		  check with False;
		- an item with a 'user_ratings' entry whose 'user' equals user['_id']
		  ends the check with True.

		Returns False if no item settles the result.
		"""
		if paged_media is None:
			return False

		media_collection = self.current_db.media
		user_media_collection = self.current_db.user_media

		for paged_item in paged_media:
			stored_media = media_collection.find_one({'name': paged_item[u'name']})
			if stored_media is None:
				return False

			ratings_doc = user_media_collection.find_one({'_id': ObjectId(stored_media[u'_id'])})
			if ratings_doc is None:
				return False

			if any(rating[u'user'] == user[u'_id'] for rating in ratings_doc[u'user_ratings']):
				return True

		return False

	def store_relation(self, relation):
		"""Store the 'direct' and 'similar' relation data for relation['user'].

		If the relations collection has no document for the user, one is
		inserted with the 'user', 'direct' and 'similar' values from relation.
		Otherwise the existing document's 'direct' and 'similar' fields are
		replaced.
		"""
		relation_collection = self.current_db.relations

		relation_item = relation_collection.find_one({'user': ObjectId(relation[u'user'])})

		#If this user has no previous relation information
		if relation_item is None:
			# NOTE(review): the lookup above matches on ObjectId(user) but the
			# raw value is stored here; confirm callers always pass an ObjectId.
			relation_collection.insert({u'user': relation[u'user'], u'direct': relation[u'direct'], u'similar': relation[u'similar']})
			return

		# Replace both fields in one $set so the document is never left with
		# a new 'direct' next to a stale 'similar'.
		relation_collection.update(
			{u'_id': ObjectId(relation_item[u'_id'])},
			{"$set": {u'direct': relation[u'direct'], u'similar': relation[u'similar']}},
			upsert=True)

	def store_rankings(self, relation, ranks):
		"""Store ranks in the user_graphs document of relation['user'].

		The document is looked up by the user's id. An existing document has
		its 'ranks' field replaced; otherwise a new document keyed by
		relation['user'] is inserted with the given ranks.
		"""
		graphs = self.current_db.user_graphs
		existing = graphs.find_one({'_id': ObjectId(relation[u'user'])})

		if existing is not None:
			# Already has a graph document: overwrite its ranks
			graphs.update({u'_id': ObjectId(existing[u'_id'])}, {"$set": {'ranks': ranks}}, upsert=True)
			return

		graphs.insert({u'_id': relation[u'user'], 'ranks': ranks})

	def store_prioritised_media(self, user, media):
		"""Replace the staged media stored for user with media.

		All staged_media documents whose 'user' equals user['_id'] are removed
		first, then media is inserted into the same collection.
		"""
		staged = self.current_db.staged_media

		# Clear what was staged for this user before storing the new media
		owner_filter = {'user': user['_id']}
		staged.remove(owner_filter)
		staged.insert(media)