def record_error(message, roster_row, browser=None, page_number_within_scrape='NO_PAGE_FOUND'):
    """Log a scrape error, archive the failing page to S3 when possible,
    and publish a failure notification to SNS.

    Args:
        message: Human-readable description of the failure.
        roster_row: Mapping with at least 'County' and 'State' keys,
            identifying the roster being scraped.
        browser: Optional Selenium-style driver; when present, its
            ``page_source`` is archived to S3 under Errors/.
        page_number_within_scrape: Page index used in the archived
            filename; defaults to a sentinel when no page is known.

    Relies on module-level ``logger``, ``s3``, ``BUCKET`` and
    ``FAILURE_SNS_TOPIC``. Intended to be called from an ``except``
    block — ``traceback.format_exc()`` captures the active exception.
    """
    county = roster_row['County']
    state = roster_row['State']
    date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    filename = ('Errors/' + state + '/' + county + '/' +
                str(datetime.now().year) + '/' +
                datetime.now().strftime("%B") + '/' +
                date_collected + '_page_{}.html'.format(page_number_within_scrape))
    print('Error on file: _%s_' % filename)
    logger.error('Error on file: _%s_', filename)

    # Only attempt the page-source upload when a browser actually exists.
    # The previous version attempted it unconditionally and relied on a
    # bare except to swallow the AttributeError raised for browser=None,
    # which also mislabeled genuine S3 upload failures as "no browser".
    if browser is not None:
        try:
            s3.Object(BUCKET, filename).put(Body=browser.page_source)
        except Exception:
            logger.exception("Failed to save error page to S3: _%s_", filename)
    else:
        logger.warning("No browser defined, so no error page saved")

    # Build and publish the failure notification once, with traceback.
    sns_message = {
        "County": county,
        "State": state,
        "Message": message,
        "Traceback": traceback.format_exc(),
    }
    logger.error('Error message: _%s_', sns_message)
    sns = boto3.client('sns')
    response = sns.publish(
        TargetArn=FAILURE_SNS_TOPIC,
        Message=json.dumps(sns_message, indent=2)
    )
def save_to_s3(page_data, page_number_within_scrape, roster_row, filetype='html'):
    """Store one scraped page in the jail-crawl S3 bucket.

    Args:
        page_data: Raw page bytes/string to upload.
        page_number_within_scrape: Page index embedded in the key name.
        roster_row: Mapping with 'County' and 'State' keys used to build
            the State/County/Year/Month key prefix.
        filetype: 'pdf', 'xls', or anything else for the 'html' default.

    Relies on module-level ``s3``, ``BUCKET`` and ``logger``.
    """
    county = roster_row['County']
    state = roster_row['State']
    date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Single filename construction; previously the full path expression was
    # copy-pasted per filetype, differing only in the extension. Unknown
    # filetypes fall back to 'html', matching the original else-branch.
    extension = filetype if filetype in ('pdf', 'xls') else 'html'
    filename = (state + '/' + county + '/' + str(datetime.now().year) + '/' +
                datetime.now().strftime("%B") + '/' + date_collected +
                '_page_{}.{}'.format(page_number_within_scrape, extension))
    print(filename)
    logger.info('Saved file: _%s_', filename)
    s3.Object(BUCKET, filename).put(Body=page_data)
def main(urlAddress):
    """Download the roster PDF at *urlAddress* and persist it.

    Depending on the hard-coded ``aws`` toggle the PDF goes either to the
    'jailcrawl' S3 bucket or to a local Scrapings/<state>/<county>/ folder.
    On any failure the error message is posted to the Firebase error log.

    Relies on module-level ``requests``, ``s3``, ``state``, ``county``,
    ``root_directory``, ``firebase`` and ``locationInUse``.
    """
    try:
        req = requests.get(urlAddress)
        pdf_data = req.content
        # Collection timestamp doubles as the stored filename.
        date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Toggle local/s3 storage (hard-coded; flip to False for local runs).
        aws = True
        if aws:
            s3.Object(
                'jailcrawl',
                state + '/' + county + '/' + str(datetime.now().year) + '/' +
                datetime.now().strftime("%B") + '/' + date_collected + '.pdf'
            ).put(Body=pdf_data)
        else:
            local_dir = './Scrapings/{}/{}/'.format(state, county)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            # Context manager guarantees the handle is closed even if the
            # write raises (the original left the file open on error).
            with open(root_directory + '/Scrapings/{}/{}/'.format(state, county) +
                      date_collected + '.pdf', 'wb') as file_:
                file_.write(pdf_data)
    except Exception as errorMessage:
        # Post error to firebase server so failures are visible remotely.
        date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        data = {'Message': str(errorMessage)}
        firebase.put('/ErrorLogs/' + locationInUse + '/', date_collected, data)
# Python 2 script fragment: create a team/state S3 bucket (legacy boto),
# then check via boto3 whether the base-data CSV already exists in it.
# Depends on ``configdata``, ``log_entry``, AWS credential constants and
# ``botocore`` defined outside this chunk.
TeamNumber=configdata["team"]
# Bucket name is <team><state>assignment1, lower-cased state.
bucket_name = str(TeamNumber) + configdata["state"].lower() + 'assignment1'
print bucket_name
# NOTE(review): this logs success before the bucket is actually created below.
log_entry("S3 bucket has been successfully created.")
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
print conn
bucket = conn.create_bucket(bucket_name, location=boto.s3.connection.Location.DEFAULT)
filename_base_data=configdata["state"]+"_"+configdata["StationId"]
s3 = boto3.resource('s3',
                    aws_access_key_id=AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
# NOTE(review): ``file`` shadows the Python builtin.
file = filename_base_data+".csv"
exists = False
# Probe for the object: a 404 ClientError means it is absent; any other
# ClientError is re-raised; a clean load() means it exists.
try:
    s3.Object(bucket_name, file).load()
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        exists = False
        print exists
    else:
        raise
else:
    exists = True
    print exists
# Only fetch the base-data links when the CSV is not already in S3.
if exists==False:
    Listlinks=configdata["basedata_links"]
    length=len(Listlinks)
    for counter in range(0,length):
        link= Listlinks[counter]
        # NOTE(review): loop body appears truncated by the file chunking —
        # presumably each link is downloaded/uploaded; confirm in full source.
# Python 2 smoke-test script: connect to S3 with static credentials and
# upload a small test object to verify write access.
import boto3
import boto.s3
import sys
from boto.s3.key import Key

print "Connecting to S3"
#Should be kept secret, but these are for the ttds-group-user account,
#who is in the ttds-team group which only has s3 access - so an attacker
#can't do tooo much damage I don't think
# NOTE(review): even scoped credentials should come from the environment or
# an AWS profile rather than being committed in source.
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''
s3 = boto3.resource('s3',
                    aws_access_key_id=AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
test_binary_data = b'This is some text data from S3 :)'
# NOTE(review): ``object`` shadows the Python builtin.
object = s3.Object('test-bucket-ttdsgroup', 'testfile.txt')
object.put(Body=test_binary_data)
def main():
    """Train/evaluate a binary SVM pet classifier on Spark.

    Reads per-image feature JSON lines (format: "class, index, f1, f2, ...")
    from <directory>/features/, reuses a previously saved SVM model from S3
    when present (unless --force), otherwise trains SVMWithSGD, measures
    test error on the held-out rows, and writes a timing/error summary JSON
    plus (for new models) the model itself back to S3.

    Relies on module-level ``parse_arguments``, ``lg`` (logging), ``sc``
    (SparkContext), ``boto3``, ``Row``, ``LabeledPoint``, ``SVMModel``,
    ``SVMWithSGD``, ``os``, ``time`` and ``json``.
    """
    #retrieve argument
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)
    #Variables declaration
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data = args.nb_training_data
    iteration_model = args.iteration_model
    min_partition = args.min_partition
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('oc-calculdistribues-sberton')
    # Result file is timestamped; model file is keyed on the run parameters
    # so a retrained configuration gets its own artifact.
    result_file = class1 + '_' + class2 + '_' + time.strftime(
        "%Y%m%d%H%M%S") + '.json'
    model_file = 'model_' + class1 + '_' + class2 + '_' + str(
        nb_training_data) + '_' + str(iteration_model)
    model_pathname = os.path.join(main_directory, "models", model_file)
    #Searching existing model and store existence in is_model boolean
    # (Spark saves models as a directory, hence the key + '/' prefix check.)
    key = 'distributed_learning/models/' + model_file
    objs = list(bucket.objects.filter(Prefix=key))
    is_model = len(objs) > 0 and objs[0].key.startswith(key + '/')
    start_time = time.time()
    lg.info(
        '#################### Starting pet-classification ######################'
    )
    lg.info('Class 1 is %s', class1)
    lg.info('Class 2 is %s', class2)
    lg.info('Number of training datas is %s', nb_training_data)
    lg.info('Number of iterations model is %s', iteration_model)
    #persist a common rdd which is using by both training and testing datas
    # Keep only lines of the two requested classes ('All' keeps everything
    # that is not class1 as the negative class).
    common_rdd = sc.textFile(directory_feature, minPartitions=min_partition)\
        .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
        .persist()
    #Loading model if exists
    if is_model and not force_by_user:
        model = SVMModel.load(sc, model_pathname)
        lg.info('Found and load recorded model %s', model_file)
    else:
        lg.info('No recorded model found')
        #create training rdd and train model if no model found or force
        # Rows with index <= nb_training_data are the training split;
        # class1 is labeled 0.0, the other class 1.0.
        # NOTE(review): this filter splits on ',' while the test filter
        # below splits on ', ' — int() tolerates the leading space, but the
        # inconsistency is fragile; confirm the feature-file delimiter.
        train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\
            .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                 if line.split(', ')[0] == class1
                 else Row(label=1.0, features=line.split(', ')[2:]))\
            .map(lambda line: LabeledPoint(line.label, line.features))
        lg.info('%s features for training datas', train_data_rdd.count())
        lg.info('Start to training model')
        model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model)
        lg.info('Training model terminated')
    training_time = time.time()
    training_duration = training_time - start_time
    #Create testing rdd
    # Remaining rows (index > nb_training_data) form the test split.
    test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
        .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
             if line.split(', ')[0] == class1
             else Row(label=1.0, features=line.split(', ')[2:]))\
        .map(lambda row: LabeledPoint(row.label, row.features))
    lg.info('%s features for test datas', test_data_rdd.count())
    # Evaluating the model on training data
    # (label, prediction) pairs; error = fraction of mismatches.
    predictions = test_data_rdd.map(
        lambda row: (row.label, float(model.predict(row.features))))
    train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
        / float(predictions.count())
    lg.info('Test Error : %s', str(train_error))
    end_time = time.time()
    duration = end_time - start_time
    lg.info('Duration %s', str(duration))
    prediction_duration = end_time - training_time
    # #Save and dump result on S3
    result.append({
        "class1": class1,
        "class2": class2,
        "iteration_model": iteration_model,
        "nb_training_data": nb_training_data,
        "total_duration": duration,
        "train_duration": training_duration,
        "predict_duration": prediction_duration,
        "error": train_error
    })
    s3object = s3.Object('oc-calculdistribues-sberton', result_file)
    s3object.put(Body=(bytes(json.dumps(result, indent=2).encode('UTF-8'))))
    #Save model if not exists
    if not is_model:
        lg.info('Saving model at %s', model_file)
        model.save(sc, model_pathname)
    lg.info(
        '#################### Ending pet-classification ######################'
    )