def __init__(self, host='localhost', port=27017):
    """Open the MongoDB connection and select the working database.

    Args:
        host: MongoDB host name passed to ``MongoClient``.
        port: MongoDB port passed to ``MongoClient``.
    """
    self.host, self.port = host, port
    self.client = MongoClient(host, port)
    self.yt_validator = YoutubeValidator(self.client)

    # The database follows the module-level ``current_env`` setting; any
    # value other than 'dev' or 'prod' leaves ``current_db`` unassigned.
    if current_env == 'prod':
        self.current_db = self.client.revolvr
    elif current_env == 'dev':
        self.current_db = self.client.revolvr_development
class MongoConnector:
    """Store media, user ratings, tags, relations and rankings in MongoDB.

    Keeps the MongoDB client, a YouTube link validator and the database
    selected by the module-level ``current_env`` setting.  The collection
    calls deliberately stay on the ``insert`` / ``update`` / ``remove``
    methods the original code used.
    """

    def __init__(self, host='localhost', port=27017):
        """Connect to MongoDB and select the database for ``current_env``.

        Args:
            host: MongoDB host name passed to ``MongoClient``.
            port: MongoDB port passed to ``MongoClient``.
        """
        self.host = host
        self.port = port
        self.client = MongoClient(host, port)
        self.yt_validator = YoutubeValidator(self.client)
        # NOTE: any value of current_env other than 'dev' or 'prod' leaves
        # self.current_db unset, so every storage method will fail later.
        if current_env == 'dev':
            self.current_db = self.client.revolvr_development
        elif current_env == 'prod':
            self.current_db = self.client.revolvr

    def _is_storable(self, item):
        """Return a truthy value when ``item`` may be stored.

        Links containing 'youtube' must pass the YouTube validator; every
        other link is accepted without validation.
        """
        link = item['link']
        return 'youtube' not in link or self.yt_validator.check_media(link)

    def insert_unassigned_media(self, all_media):
        """Store media items without associating them with a user.

        Args:
            all_media: iterable of sources; ``source[1]`` is the iterable of
                media item dicts, each carrying a 'link' key.
        """
        for source in all_media:
            for item in source[1]:
                if self._is_storable(item):
                    self.process_media_item(item)

    def insert_media(self, username, all_media=None):
        """Store media items and record the user's rating and tags for each.

        Args:
            username: user the media is attributed to.
            all_media: iterable of sources (see ``insert_unassigned_media``);
                ``None`` makes the call a no-op.
        """
        if all_media is None:
            return
        for source in all_media:
            # For each media source, walk its media items.
            for item in source[1]:
                if not self._is_storable(item):
                    continue
                media_id = self.process_media_item(item)
                self.process_user_media_item(media_id, username)
                self.process_tag_item(media_id, username)

    def process_media_item(self, item):
        """Insert ``item`` into ``media`` unless it is already stored.

        Returns:
            The ``_id`` of the existing or newly inserted media document.
        """
        media_id = self._get_media_id(item)
        # Only insert when there is no record of this item yet.
        if media_id is None:
            media_id = self.current_db.media.insert(item)
        return media_id

    def process_user_media_item(self, media_id, username):
        """Record that ``username`` is associated with the media item.

        A new ``user_media`` document is created when the media has no
        ratings yet; otherwise the user is appended unless already present.
        The rating itself is a random integer between 1 and 100.
        """
        user_media_collection = self.current_db.user_media
        user_media_item = user_media_collection.find_one(
            {'_id': ObjectId(media_id)})

        # Media not previously associated with any user: create the record.
        if user_media_item is None:
            user_media_collection.insert({
                u'_id': media_id,
                'user_ratings': [
                    {'user': username, 'rating': randint(1, 100)},
                ],
            })
            return

        user_ratings = user_media_item[u'user_ratings']
        # any() also covers an empty ratings list, which previously raised
        # NameError because the flag was only assigned inside the loop.
        if any(rating[u'user'] == username for rating in user_ratings):
            return

        user_ratings.append({'user': username, 'rating': randint(1, 100)})
        user_media_collection.update(
            {u'_id': ObjectId(media_id)},
            {"$set": {'user_ratings': user_ratings}},
            upsert=True)

    def process_tag_item(self, media_id, username):
        """Store tags derived from the media item's name for ``username``.

        The stripped title is stored as one phrase; when it has more than
        one word, each word is stored as a term as well.  Nothing is stored
        when the media document is missing, has no name, or the name
        reduces to an empty title.
        """
        media_item = self.current_db.media.find_one(
            {'_id': ObjectId(media_id)})
        if media_item is None:
            return

        name = media_item.get(u'name')
        if not name:
            return

        title = self._strip_tags(name)
        # A title made only of stop words would create an empty-phrase tag.
        if not title:
            return

        self._check_and_insert_title(username, title)
        terms = title.split()
        if len(terms) > 1:
            self._check_and_insert_terms(username, terms)

    def _check_and_insert_title(self, username, title):
        """Associate ``username`` with the whole-title phrase.

        Returns:
            True when the phrase or the user's association was stored,
            False when the user was already associated with the phrase.
        """
        tag_collection = self.current_db.tags
        tag_item = tag_collection.find_one({'phrase': title})

        # Phrase never seen before: store it with this user.
        if tag_item is None:
            tag_collection.insert({
                'phrase': title,
                'users': [{'user': username, 'count': 1}],
            })
            return True

        # Phrase seen before, but not necessarily from this user.
        users = tag_item[u'users']
        if any(user[u'user'] == username for user in users):
            return False

        users.append({'user': username, 'count': 1})
        tag_collection.update(
            {u'_id': tag_item[u'_id']},
            {"$set": {'users': users}},
            upsert=True)
        return True

    def _check_and_insert_terms(self, username, terms):
        """Store or update one tag document per term in ``terms``."""
        tag_collection = self.current_db.tags
        for term in terms:
            tag_item = tag_collection.find_one({'phrase': term})

            # Term doesn't exist yet, so insert it.
            if tag_item is None:
                associations = self._get_associations_for_tag(
                    term, terms, None)
                tag_collection.insert({
                    'phrase': term,
                    'users': [{'user': username, 'count': 1}],
                    'associations': associations,
                })
                continue

            users = self._get_users_for_tag(
                term, tag_item[u'users'], username)
            associations = self._get_associations_for_tag(
                term, terms, tag_item)
            # Both fields go into one $set so the document is written once.
            tag_collection.update(
                {u'_id': tag_item[u'_id']},
                {"$set": {'users': users, 'associations': associations}},
                upsert=True)

    def _strip_tags(self, title):
        """Reduce ``title`` to its significant, normalised words.

        Drops English stop words, single-character words and words listed
        in the custom 'media' corpus; the remaining words are normalised by
        ``_strip_word`` and joined with single spaces.
        """
        words = title.split()
        if not words:
            return ''

        custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')
        # Load each word list once per call instead of once per word.
        english_stopwords = set(stopwords.words('english'))
        media_words = None  # read lazily, only if a word gets that far

        kept = []
        for word in words:
            no_punc = ''.join(
                c for c in word if c not in string.punctuation)
            # Skip stop words and single letters.
            if len(no_punc) <= 1 or no_punc.lower() in english_stopwords:
                continue
            stripped_word = self._strip_word(word)
            if len(stripped_word) <= 1:
                continue
            if media_words is None:
                media_words = set(custom_corpus.words('media'))
            if stripped_word not in media_words:
                kept.append(stripped_word)
        return ' '.join(kept)

    def _strip_word(self, word):
        """Lower-case ``word`` and drop digits and punctuation.

        The apostrophe is the only punctuation character that is kept.
        """
        removable = string.punctuation.replace("'", "")
        return ''.join(
            c for c in word.lower()
            if not c.isdigit() and c not in removable)

    def _get_users_for_tag(self, term, users, username):
        """Return ``users`` with ``username``'s count added or incremented.

        The list is updated in place and also returned.
        """
        for user in users:
            if user[u'user'] == username:
                user[u'count'] = int(user[u'count']) + 1
                return users
        # This user hasn't shown interest in this phrase before.
        users.append({'user': username, 'count': 1})
        return users

    def _get_associations_for_tag(self, term, terms, tag_item):
        """Build the association list for ``term`` after seeing ``terms``.

        Args:
            term: the phrase whose association list is being modified.
            terms: all terms in the current title.
            tag_item: the stored tag document for ``term``, or None; its
                'associations' entry (if any) holds the previous values.

        Returns:
            A new list of ``{'phrase', 'count'}`` dicts: every previous
            association is kept, the count of each distinct co-occurring
            word in ``terms`` is increased by one, and unseen words are
            appended with a count of 1.
        """
        prev_associations = None
        if tag_item is not None:
            prev_associations = tag_item.get(u'associations')

        associations = []
        by_phrase = {}
        # Carry over every previous association; earlier code dropped the
        # ones whose phrase was absent from the current title.
        for previous in prev_associations or []:
            entry = dict(previous)
            associations.append(entry)
            # When stored data has duplicates the first entry wins.
            by_phrase.setdefault(entry[u'phrase'], entry)

        seen = set()
        for word in terms:
            # Skip the term itself and words already counted for this title.
            if word == term or word in seen:
                continue
            seen.add(word)
            entry = by_phrase.get(word)
            if entry is None:
                entry = {'phrase': word, 'count': 1}
                associations.append(entry)
                by_phrase[word] = entry
            else:
                entry[u'count'] = int(entry[u'count']) + 1
        return associations

    def _get_media_id(self, item):
        """Return the ``_id`` of the stored media matching ``item``.

        Matching is by 'link' when the item has one, otherwise by 'name'.
        Returns None when the item has neither key or nothing matches.
        """
        if 'link' in item:
            query = {'link': item[u'link']}
        elif 'name' in item:
            query = {'name': item[u'name']}
        else:
            return None
        # Let the server find the match rather than scanning every document.
        media = self.current_db.media.find_one(query)
        return None if media is None else media[u'_id']

    def media_exists(self, paged_media, user):
        """Report whether ``user`` has a rating for any of ``paged_media``.

        Returns False as soon as an item has no stored media document or no
        ``user_media`` document, True at the first item rated by
        ``user['_id']``, and False when no item qualifies or
        ``paged_media`` is None.
        """
        if paged_media is None:
            return False

        media_collection = self.current_db.media
        user_media_collection = self.current_db.user_media
        for paged_item in paged_media:
            media_item = media_collection.find_one(
                {'name': paged_item[u'name']})
            if media_item is None:
                return False
            user_media_item = user_media_collection.find_one(
                {'_id': ObjectId(media_item[u'_id'])})
            if user_media_item is None:
                return False
            if any(rating[u'user'] == user[u'_id']
                   for rating in user_media_item[u'user_ratings']):
                return True
        return False

    def store_relation(self, relation):
        """Insert or replace the relation data for ``relation['user']``.

        NOTE(review): the lookup wraps the user id in ObjectId while the
        insert stores it as given; confirm callers always pass an ObjectId.
        """
        relation_collection = self.current_db.relations
        relation_item = relation_collection.find_one(
            {'user': ObjectId(relation[u'user'])})

        # No previous relation information for this user.
        if relation_item is None:
            relation_collection.insert({
                u'user': relation[u'user'],
                u'direct': relation[u'direct'],
                u'similar': relation[u'similar'],
            })
            return

        # Otherwise replace both relation fields in a single write.
        relation_collection.update(
            {u'_id': ObjectId(relation_item[u'_id'])},
            {"$set": {u'direct': relation[u'direct'],
                      u'similar': relation[u'similar']}},
            upsert=True)

    def store_rankings(self, relation, ranks):
        """Insert or replace the ``ranks`` stored for ``relation['user']``."""
        user_graphs_collection = self.current_db.user_graphs
        user_graphs_item = user_graphs_collection.find_one(
            {'_id': ObjectId(relation[u'user'])})

        # No previous user_graphs information for this user.
        if user_graphs_item is None:
            user_graphs_collection.insert(
                {u'_id': relation[u'user'], 'ranks': ranks})
        else:
            user_graphs_collection.update(
                {u'_id': ObjectId(user_graphs_item[u'_id'])},
                {"$set": {'ranks': ranks}},
                upsert=True)

    def store_prioritised_media(self, user, media):
        """Replace the staged media stored for ``user`` with ``media``."""
        staged_media_collection = self.current_db.staged_media
        staged_media_collection.remove({'user': user['_id']})
        staged_media_collection.insert(media)