class CountryDB(object):
    def __init__(self, mongo_host):
        self._db = Connection(mongo_host)['crunch']['country']

    def save(self, country):
        self._db.save({'_id': country})

    def increment(self, country):
        self._db.update({'_id': country}, {'$inc': {'c': 1}}, upsert=True)
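# A minimal usage sketch for CountryDB; the 'localhost' host below is an
# assumption. increment() relies on upsert=True, so the counter document is
# created on first use even when save() was never called for that country.
country_db = CountryDB('localhost')    # hypothetical mongod host
country_db.increment('USA')            # upserts {'_id': 'USA', 'c': 1}
country_db.increment('USA')            # now {'_id': 'USA', 'c': 2}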
class LocalTouristClassifier(object):
    def __init__(self):
        self.tweets = Connection().tweetsDB.tweetsCollection
        self.tweetUsers = Connection().tweetsDB.tweetUsersCollection
        self.tweetUsers.ensure_index([("loc", GEO2D)])
        self.photos = Connection().flickrDB.flickrCollection
        self.linked = Connection().linkedDB.linkedCollection
        API_KEY = 'dj0yJmk9UUY5TWxNMXBRb0M3JmQ9WVdrOVV6RlVOWFEzTjJzbWNHbzlNVGMzTVRBNE5EazJNZy0tJnM9Y29uc3VtZXJzZWNyZXQmeD0zYQ--'
        SHARED_SECRET = '92a96753c369996f18b6a2ef4a6b1b9c85de04f5'
        self.y = yql.TwoLegged(API_KEY, SHARED_SECRET)
        self.yqlCache = {}

    def unescape_html_chars(self, item):
        return item.replace("&amp;", "&").replace("&gt;", ">").replace("&lt;", "<").replace("&quot;", "\"")

    def classifyTwitter(self):
        # self.place is expected to be set on the instance elsewhere
        for tweet in self.tweets.find({"place": self.place}):
            if tweet['fromUserID'] is not None:
                tweetUser = self.tweetUsers.find_one({'id': tweet['fromUserID']})
                if tweetUser is not None:
                    tweetUserLocation = tweetUser['location']
                    if tweetUserLocation is not None and tweetUser['loc'] is None:
                        tweetUserLocation = tweetUserLocation.encode('utf-8')
                        #print "%s || %s" % (tweetUserLocation, self.place)
                        # we use the yqlCache local dictionary to use as few calls as possible
                        if self.yqlCache.get(tweetUserLocation) is not None and self.yqlCache[tweetUserLocation] != 0:
                            tweetUser['loc'] = self.yqlCache[tweetUserLocation]
                            print 'cacheSuccess: %20s %15s %s' % (tweetUserLocation, tweetUser['id'], tweetUser['loc'])
                        else:
                            # send request out to YQL
                            yqlQuery = 'select * from geo.placefinder where text="%s";' % tweetUserLocation
                            try:
                                yqlResult = self.y.execute(yqlQuery)
                                if yqlResult.rows == []:
                                    # yql couldn't figure out where this is, so don't save a loc
                                    self.yqlCache[tweetUserLocation] = 0
                                    print 'fail: %20s %s' % (tweetUserLocation, tweetUser['id'])
                                else:
                                    # yql found a lat and lon, so let's tag it
                                    loc = [float(yqlResult.rows[0].get('latitude')),
                                           float(yqlResult.rows[0].get('longitude'))]
                                    tweetUser['loc'] = loc
                                    self.yqlCache[tweetUserLocation] = loc
                                    print 'success: %20s %15s %s' % (tweetUserLocation, tweetUser['id'], loc)
                            except:
                                print "Exception Detected:", sys.exc_info()[0]
                        # ready to save user
                        self.tweetUsers.save(tweetUser)
def fetch_extended(mongo_host, start_idx=None):
    ext_col = Connection(mongo_host)['crunch']['extended']
    companies = [c['_id'] for c in get_companies_list(mongo_host, start_idx)]
    BATCH = 10
    for s in range(0, len(companies), BATCH):
        print companies[s]
        urls = [COMPANY_ENDPOINT % companies[i] for i in range(s, s + BATCH)]
        results = _get_batch(urls)
        for result in results:
            data = result
            data['_id'] = result['permalink']
            ext_col.save(data)
    print companies
def fetch_companies(mongo_host, start_idx=1):
    col = Connection(mongo_host)['crunch']['company']
    TOTAL = 8200
    BATCH = 10
    for s in range(start_idx, TOTAL, BATCH):
        print s
        urls = [SEARCH_ENDPOINT % i for i in range(s, s + BATCH)]
        results_list = _get_batch(urls, 'results')
        for results in results_list:
            for result in results:
                data = result
                data['_id'] = result['permalink']
                col.save(data)
class RngDbHandler(object):
    """ Database access handler """

    def __init__(self, collection, db='rng',
                 host=os.getenv('MONGO_HOST', 'localhost'), port=None):
        """ Init log handler and store the collection handle """
        self.collection = Connection(host, port)[db][collection]

    def insert(self, record):
        """ Store the record to the collection. Async insert """
        try:
            self.collection.save(record)
        except InvalidDocument, e:
            logging.error("Unable to save log record: %s", e.message)
class StatsDB(object):
    def __init__(self, mongo_host):
        self._db = Connection(mongo_host)['crunch']['company_stats']

    def save(self, company, stats):
        self._db.save({'_id': company, 'data': stats})

    def find(self, query=None, *args, **kwargs):
        end_ts = datetime(1995, 1, 1)
        query = query or {}
        query['data.founded_at'] = {'$gte': end_ts}
        query['data.category_code'] = {'$ne': None}
        query['data.number_of_employees'] = {'$gt': 0}
        query['data.total_money_raised'] = {'$gt': 0}
        for i in self._db.find(query, *args, **kwargs):
            if get_dotted(i, 'data.founded_at') >= end_ts:
                yield i
        raise StopIteration
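# A hedged usage sketch for StatsDB.find(); the host and the extra
# 'data.country_code' filter are assumptions, not part of the class above.
# find() already constrains founded_at, category_code, number_of_employees
# and total_money_raised, so callers only pass additional criteria.
stats_db = StatsDB('localhost')                            # hypothetical mongod host
for company in stats_db.find({'data.country_code': 'USA'}):
    print company['_id']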
class MongoEmitter(Emitter):
    def __init__(self):
        super(MongoEmitter, self).__init__()
        self._mongo = Connection()['k2']['locations']

    def emit_record(self, record):
        doc = self._mongo.find_one({'code': record['msa_code']})
        if doc:
            doc['ffiec'] = {
                'low': record['low'],
                'high': record['high'],
                'avg': record['avg'],
            }
            self._mongo.save(doc)
        else:
            print "[%s] %s not found" % (record['msa_code'], record['name'])

    def done(self):
        pass
# tmp_items and NA_VER are defined earlier in the script; ids below 5000 are
# items, everything else is treated as a rune.
items = []
runes = []
for i in tmp_items:
    if i['_id'] < 5000:
        items.append(i)
    else:
        runes.append(i)

conn = Connection()['lol']['items']

import codecs
ct = codecs.open('loltw_constant.py', 'w', 'utf-8')
# write the BOM as unicode so the utf-8 stream writer accepts it
ct.write(codecs.BOM_UTF8.decode('utf-8'))

f = open('items.urls', 'w')
ct.write('ITEM_NAME_LOOKUP={')
for i in items:
    conn.save(i)
    f.write('url="http://na.leagueoflegends.com/sites/default/files/game_data/%s/content/item/%d.gif"\n' % (NA_VER, i['_id']))
    f.write('out="../../static/img/items/%d.gif"\n' % (i['_id']))
    f.write('create-dirs\n\n')
    ct.write("'%s': '%s'," % (i['_id'], i['displayname'].get('zh_TW')))
f.close()
ct.write('}\n')

conn = Connection()['lol']['runes']
f = open('runes.urls', 'w')
ct.write('RUNE_NAME_LOOKUP={')
for r in runes:
    conn.save(r)
    f.write('url="http://na.leagueoflegends.com/sites/default/files/game_data/%s/content/rune/%d.gif"\n' % (NA_VER, r['_id']))
    f.write('out="../../static/img/runes/%d.gif"\n' % (r['_id']))
    # the snippet is cut off here; the two lines below are assumed to mirror
    # the item loop above
    f.write('create-dirs\n\n')
    ct.write("'%s': '%s'," % (r['_id'], r['displayname'].get('zh_TW')))
class Collective(object):

    class Adaptive(object):
        # See the XCS paper for details: Butz and Wilson, 2001
        # http://citeseer.ist.psu.edu/old/700101.html
        N = population_size = 1000
        B = learning_rate = .1
        a = accuracy_slope = .1
        e0 = error_minimum = 1
        v = power_parameter = -5
        g = discount_factor = .8
        OGA = ga_threshold = 40
        X = crossover_prob = .75
        u = mutation_prob = .01
        Odel = experience_threshold = 20
        d = fitness_threshold = .01
        Osub = subsumption_threshold = 50
        P = mask_probability = .3
        p1 = initial_prediction = 1
        e1 = initial_error = .1
        F1 = initial_fitness = .01
        pexplr = exploration_probability = .25
        Omna = coverage_threshold = 4
        # Reduced learning rate for the environmental error
        Be = variance_learning_rate = .05
        # Subsumption is probably not useful here.
        doGASubsumption = False
        doActionSetSubsumption = False

    # Todo: Consider weak refs for this
    singletons = {}

    def __new__(cls, table):
        try:
            result = cls.singletons[table]
        except KeyError:
            result = object.__new__(cls)
            cls.singletons[table] = result
            result.init(table)
        return result

    def init(self, table):
        # Not __init__, because that gets run too often.
        self.values = self.Adaptive()
        self.rules = self.retrieve(table)
        self.timestamp = 0

    def retrieve(self, table):
        try:
            from pymongo import Connection
        except ImportError:
            # without pymongo there is no backing table, so save() becomes a no-op
            self.table = None
            rules = []
        else:
            self.table = Connection().parang[table]
            rules = [Classifier(row) for row in self.table.find()]
        return rules

    def save(self, rule):
        # Called whenever a classifier is created or changed.
        if self.table:
            if rule.n > 0:
                uid = self.table.save(rule.values())
                rule._id = uid
            elif rule._id:
                self.table.remove(rule._id)

    def generate(self, msg):
        self.timestamp += 1
        results = defaultdict(list)
        for rule in self.rules:
            output = rule.matches(msg)
            if output is not None:
                results[output].append(rule)
        while sum(r.n for s in results for r in results[s]) < self.values.Omna:
            rule = self.coverage(msg, results)
            output = rule.matches(msg)
            results[output].append(rule)
            self.delete()
        actions = dict((key, sum(r.p * r.F for r in results[key]) / sum(r.F for r in results[key]))
                       for key in results if results[key])
        action = weighted_choice(actions)
        action_set = results[action]
        brigade = self.values.g * max(actions.values())
        return action, action_set, brigade

    def coverage(self, msg, actions):
        r"Creates a new classifier to fit the under-represented message."
        # Ignore only P% of the pattern bits.
        mask = 0
        for n in range(Classifier.bits):
            if random() > self.values.P:
                mask |= 1 << n
        while True:
            # Generate an action not in the match set.
            # This also guarantees that the new classifier is unique.
            action = randrange(1 << Classifier.bits)
            if action not in actions:
                break
        values = {
            "pattern": msg,
            "pattern_mask": mask,
            "output": action,
            "output_mask": 0,
            "prediction": self.values.p1,
            "error": self.values.e1,
            "fitness": self.values.F1,
            "experience": 0,
            "timestamp": self.timestamp,
            "setsize": len(actions),
            "numerosity": 1,
        }
        rule = Classifier(values)
        self.rules.append(rule)
        self.save(rule)
        return rule

    def update(self, action_set, bonus, msg):
        # Update the action set
        set_size = sum(rule.n for rule in action_set)
        if not set_size:
            # All classifiers have been deleted.
            # Continuing would cause division by zero errors.
            return
        # Factor out the error due to environmental changes
        ubar = min(bonus - rule.p for rule in action_set)
        accuracy = 0
        for rule in action_set:
            rule.exp += 1
            factor = max(1. / rule.exp, self.values.B)
            # Reordering these updates may help for more complex problems.
            rule.u += (ubar - rule.u) * self.values.Be
            rule.p += (bonus - rule.p) * factor
            err = abs(bonus - rule.p) - rule.u
            if err < 0:
                err = self.values.e0
            rule.e += (err - rule.e) * factor
            rule.s += (set_size - rule.s) * factor
            if rule.e < self.values.e0:
                rule.k = 1
            else:
                rule.k = self.values.a * (rule.e / self.values.e0) ** self.values.v
            accuracy += rule.k * rule.n
        # Update the fitness separately, using the total accuracy.
        for rule in action_set:
            rule.F += (rule.k * rule.n / accuracy - rule.F) * self.values.B
        for rule in action_set:
            self.save(rule)
        # Run the genetic algorithm every so often
        avetime = sum(r.ts * r.n for r in action_set) / set_size
        if self.timestamp - avetime > self.values.OGA:
            self.genetic(action_set, msg)

    def genetic(self, action_set, msg):
        # Set timestamps for future use
        for rule in action_set:
            rule.ts = self.timestamp
        # Choose two, based on their fitness values
        fitness = dict((rule, rule.F) for rule in action_set)
        first = weighted_choice(fitness).copy()
        second = weighted_choice(fitness).copy()
        if random() < self.values.X:
            self.crossover(first, second)
        self.mutate(first, msg)
        self.mutate(second, msg)
        self.insert(first)
        self.insert(second)
        self.delete()

    def crossover(self, first, second):
        x = randrange(Classifier.bits)
        y = randrange(Classifier.bits)
        if x > y:
            x, y = y, x
        mask = 0
        for n in range(x, y + 1):
            mask |= 1 << n
        fp, fpm, fo, fom = first.unpack()
        sp, spm, so, som = second.unpack()
        # Swap the pattern, using the bitwise trick
        fp ^= sp & mask
        sp ^= fp & mask
        fp ^= sp & mask
        # Swap the pattern mask
        fpm ^= spm & mask
        spm ^= fpm & mask
        fpm ^= spm & mask
        first.pack(fp, fpm, fo, fom)
        second.pack(sp, spm, so, som)
        # Average out the performance measurements
        first.p = second.p = (first.p + second.p) / 2
        first.e = second.e = (first.e + second.e) / 2
        first.F = second.F = (first.F + second.F) / 2

    def mutate(self, rule, msg):
        prob = self.values.u
        pattern, pattern_mask, output, output_mask = rule.unpack()
        for n in range(Classifier.bits):
            bit = 1 << n
            if random() < prob:
                # Mutate only within the matching niche
                pattern_mask ^= bit
                if msg & bit:
                    pattern |= bit
                else:
                    pattern &= ~bit
            if random() < prob:
                output ^= bit
            if random() < prob:
                output_mask ^= bit
        # Save the new values
        rule.pack(pattern, pattern_mask, output, output_mask)
        # Temporarily decrease fitness
        rule.F *= 0.1

    def insert(self, rule):
        for r in self.rules:
            if r.chromosome == rule.chromosome:
                r.n += rule.n
                self.save(r)
                break
        else:
            self.rules.append(rule)
            self.save(rule)

    def delete(self):
        total = sum(rule.n for rule in self.rules)
        excess = total - self.values.N
        if excess < 1:
            return []
        fitness = sum(rule.F for rule in self.rules) / total
        scores = dict((rule, self.unfitness(rule, fitness)) for rule in self.rules)
        deleted = []
        while excess > 0:
            rule = weighted_choice(scores)
            rule.n -= 1
            if rule.n <= 0:
                self.rules.remove(rule)
                del scores[rule]
                deleted.append(rule)
            self.save(rule)
            excess -= 1
        return deleted

    def unfitness(self, rule, average):
        result = rule.n * rule.s
        if rule.exp > self.values.Odel and rule.F < average * self.values.d * rule.n:
            result *= average * rule.n / rule.F
        return result
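# A minimal driving-loop sketch for Collective, not part of the class above.
# It assumes a hypothetical environment object with observe()/reward()
# methods (bit-pattern observations, numeric payoffs) plus the Classifier and
# weighted_choice helpers the class already references. It shows how
# generate() and update() fit together: the discounted estimate ("brigade")
# from the current step is backed up to the previous step's action set.
def run_episode(env, table='rules', steps=100):
    collective = Collective(table)
    prev_set = prev_msg = None
    prev_reward = 0
    for _ in range(steps):
        msg = env.observe()
        action, action_set, brigade = collective.generate(msg)
        if prev_set is not None:
            collective.update(prev_set, prev_reward + brigade, prev_msg)
        prev_set, prev_msg = action_set, msg
        prev_reward = env.reward(action)
    if prev_set is not None:
        # final step: no future estimate left to back up
        collective.update(prev_set, prev_reward, prev_msg)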
# Grab file information
# try:
#     meta = extractMetadata(createParser(media['path']))
# except KeyboardInterrupt:
#     raise
# except:
#     meta = None
#     logging.debug("Failed grabbing meta.", exc_info=3)

poster_name = None
if pt_url:
    # Download poster
    poster_name = "%s%s%s-%s.%s" % (dirname, os.sep, fn, "poster", "jpg")
    urlretrieve(pt_url, poster_name)
    movie["poster"] = poster_name

backdrop_name = None
if bd_url:
    # Download backdrop
    backdrop_name = "%s%s%s-%s.%s" % (dirname, os.sep, fn, "backdrop", "jpg")
    urlretrieve(bd_url, backdrop_name)
    movie["backdrop"] = backdrop_name

# Update document
movie["rating"] = ctx["rating"]
movie["certification"] = ctx["certification"]
movie["overview"] = ctx["overview"]
movie["tagline"] = info["tagline"]
movie["runtime"] = info["runtime"]
movie["genres"] = [item["name"] for item in info["genres"]]

# Store updates
coll.save(movie)
def resize_lsma(img, size):
    # (def line inferred from the resize_lsma(orig, size) call below)
    if isinstance(size, int):
        w, h = img.size
        if w > h:
            ratio = w / float(size)
            new_size = (size, int(h / ratio))
        elif w < h:
            ratio = h / float(size)
            new_size = (int(w / ratio), size)
        else:
            new_size = (size, size)
        return img.resize(new_size, Image.ANTIALIAS)


for media in coll.find():
    fn = media['path']
    if os.path.exists(fn):
        orig = Image.open(fn)
        for scale, size in image_scales:
            scaled_fn = "%s.%s" % (fn, scale)
            if scale not in media:
                if not os.path.exists(scaled_fn):
                    scaled_image = resize_lsma(orig, size)
                    scaled_image.save(scaled_fn, 'jpeg')
                media[scale] = scaled_fn
                coll.save(media)
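# image_scales is not defined in this snippet; the loop above expects an
# iterable of (field_name, longest_side) pairs. Purely as an assumption, a
# definition could look like:
# image_scales = [('thumb', 160), ('medium', 640), ('large', 1280)]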
#!/usr/bin/python
import os
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'

from pymongo import Connection
from time import time
from django.core.mail import send_mail
from settings import *

compromises = Connection(host="127.0.0.1", port=27017)['compDB']['compromiseCollection']

for compromise in compromises.find({'type': 'event'}):
    compromise_id = compromise["_id"]
    timeEvent = compromise["timestamp"]
    # events expire five minutes (300000 ms) after their timestamp
    deltaTime = time() * 1000 - (timeEvent + 300000)
    if deltaTime > 0:
        for user in compromises.find({'idEvent': str(compromise_id)}):
            send_mail(EMAIL_SUBJECT_RESULT, (EMAIL_TEXT_RESULT % compromise_id),
                      EMAIL_HOST_USER, [user["mail"]])
        compromise["type"] = "protuxlo"
        compromises.save(compromise)
#!/usr/bin/env python
# coding: utf-8
"""
Initialize the database by inserting fifty million documents; each document carries a list.
"""
from pymongo import Connection

coll = Connection("127.0.0.1", 27017)["test"]["test"]

data = [2934234] * 200
for i in xrange(50000000):
    dic = {
        "data": data,
        "uid": i,
    }
    coll.save(dic)