import datetime
import os

import digitalocean

# get_ip_address(), download_from_s3(), parse_args(), and the s3 helper
# module are assumed to be defined/imported elsewhere in this script.


def build_context(args):
    '''
    Create the dictionary of variables used throughout the script.
    `args` comes from parse_args and must be a dict-like mapping.
    '''
    context = args
    currentdate = context['query_date']
    currentyear = datetime.datetime.now().strftime("%Y")
    currentmonth = datetime.datetime.now().strftime("%m")
    input_filename = os.path.basename(context['s3_input'])
    output_base = context['filebase'] + '__' + currentdate + '__' + \
        input_filename.replace('.csv', '')

    # local stuff
    context['currentdate'] = currentdate
    context['volume_directory'] = 'pylogs/'
    context['log'] = os.path.join(
        context['volume_directory'], output_base + '.log'
    )

    # digital ocean
    if not context['token']:
        context['token'] = os.environ.get('DO_TOKEN')
    manager = digitalocean.Manager(token=context['token'])
    my_droplets = manager.get_all_droplets()
    vols = manager.get_all_volumes()
    # assumes exactly one droplet matches this machine's IP address
    mydrop = [_ for _ in my_droplets if _.ip_address == get_ip_address()][0]
    context['droplet'] = mydrop
    context['droplet_id'] = mydrop.id

    # s3 stuff
    if 's3://' not in context['s3_input']:
        raise ValueError("Improperly formatted -s3 or --s3-input flag")
    context['input'] = download_from_s3(context['s3_input'], new_dir='pylogs/')
    context['auth'] = 'pylogs/{}__{}__tokens.json'.format(mydrop.id, currentdate)
    context['s3_bucket'] = s3.get_bucket(context['s3_input'])
    context['s3_key'] = context['s3_input'].split('input/')[0]
    context['s3_path'] = os.path.join(
        context['s3_key'], 'output/user_meta_many/'
        #, currentyear, currentmonth,
    )
    context['s3_log'] = os.path.join(
        's3://' + context['s3_bucket'], 'logs', output_base + '.log'
    )
    context['s3_log_done'] = os.path.join(
        context['s3_key'], 'logs/user_meta_many/', currentyear, currentmonth,
        output_base + '.log'
    )
    context['s3_auth'] = os.path.join(
        's3://' + context['s3_bucket'], 'tokens/used',
        os.path.basename(context['auth'])
    )
    return context
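
# Usage sketch, illustrative only: it assumes parse_args() (defined elsewhere
# in this script) returns a dict-like mapping with at least the keys read
# above ('query_date', 's3_input', 'filebase', 'token').
if __name__ == '__main__':
    context = build_context(parse_args())
    print(context['log'])      # local log path under pylogs/
    print(context['s3_path'])  # S3 output prefix for user_meta_many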
import requests
from queue import Queue
from threading import Thread

# logger, postgres, s3, and process_index are assumed to be defined/imported
# elsewhere in this module.


def crawl(settings):
    logger.info('crawl started')
    try:
        connection = postgres.get_connection(settings)
    except Exception:
        logger.error('could not connect to postgres db')
    else:
        bucket = s3.get_bucket(settings)
        queue = Queue(maxsize=0)
        session = requests.Session()
        for thread_index in range(settings['threading']['num_threads']):
            worker = Thread(target=process_index,
                            args=(queue, thread_index, session, connection, bucket))
            worker.daemon = True  # don't block interpreter exit on idle workers
            logger.info('thread %s > starting', thread_index)
            worker.start()
        for index_page in range(settings['leboncoin']['start_page'],
                                settings['leboncoin']['end_page']):
            queue.put('http://www.leboncoin.fr/ventes_immobilieres/offres/?o={page}'.format(page=index_page))
        queue.join()
        logger.info('crawl ended')
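
# Usage sketch: a minimal settings dict with the nested keys crawl() reads.
# The page range and thread count are placeholder values; whatever credentials
# postgres.get_connection() and s3.get_bucket() expect would live in the same
# dict.
if __name__ == '__main__':
    settings = {
        'threading': {'num_threads': 4},
        'leboncoin': {'start_page': 1, 'end_page': 10},
    }
    crawl(settings)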
#!/usr/bin/env python
import s3

bucket_name = "d.defold.com"
archive_root = "archive"

bucket = s3.get_bucket(bucket_name)

keep = [
    "5791ee6d96b87e50eee5acd70abaa4026fefef28",  # 1.2.170
    "4ebe7a1d548eae2398717ed46f9d7d1b103d5503",  # 1.2.169
    "e22f6d2f81e7c53ebcbfefe703ff22ce5da252c0",  # 1.2.168
    "96f7a5e4f617d5f6f4645f30a3e6ff656689435d",  # 1.2.167
    "5295afb3878441fb12f497df8831148525dcfb10",  # 1.2.166
    "6fac6e80f09ab297093e3ff65a7f45ad56e06e33",  # 1.2.165
    "ff34def383f372b1f302916374310bd498105384",  # 1.2.171 beta
    "a98007b48691529b59fb099fc369d81518059d00",  # 1.2.171 editor-alpha (stable)
    "stable",
    "beta",
    "alpha",
    "dev",
    "editor-alpha",
]

# Keys are laid out as archive/<sha1-or-channel>/...; delete every archived
# build whose second path component is not in the keep list.
for key in bucket.list(prefix=archive_root):
    parts = key.name.split("/")
    sha1 = parts[1]
    if sha1 not in keep:
        print("Deleting %s" % key.name)
        key.delete()