Example No. 1
def record_error(message, roster_row, browser=None, page_number_within_scrape='NO_PAGE_FOUND'):
    """Log a scraping error, save the offending page to S3 when a browser is available, and publish an SNS alert."""
    county = roster_row['County']
    state = roster_row['State']
    date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    filename = 'Errors/' + state + '/' + county + '/' + str(datetime.now().year) + '/' + datetime.now().strftime("%B")+'/'+ date_collected + '_page_{}.html'.format(page_number_within_scrape)
    print('Error on file: _%s_' % filename)
    logger.error('Error on file: _%s_', filename)
    if not browser:
        sns_message = {
                "County": county,
                "State": state,
                "Message": message
                }
        logger.error('Error message: _%s_', sns_message)

    try:
        s3.Object(BUCKET, filename).put(Body=browser.page_source)
    except AttributeError:
        #browser is None (or has no page_source), so there is no error page to save
        logger.warning("No browser defined, so no error page saved")
    sns_message = {
            "County": county,
            "State": state,
            "Message": message,
            "Traceback": traceback.format_exc(),
            }
    logger.error('Error message: _%s_', sns_message)
    #sio = io.StringIO()
    #print(json.dumps(sns_message), file=sio)
    #sio.seek(0)
    sns = boto3.client('sns')
    response = sns.publish(
                TargetArn=FAILURE_SNS_TOPIC,
                Message=json.dumps(sns_message, indent=2)
    )
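The function above references several module-level objects that the snippet does not define (s3, BUCKET, logger, FAILURE_SNS_TOPIC). A minimal sketch of that setup, with the bucket name reused from Example No. 2 and a purely illustrative topic ARN, might look like this:

#Illustrative module-level setup assumed by record_error(); the topic ARN is a placeholder.
import json
import logging
import traceback
from datetime import datetime

import boto3

logger = logging.getLogger(__name__)
s3 = boto3.resource('s3')
BUCKET = 'jailcrawl'  #bucket name taken from Example No. 2; adjust as needed
FAILURE_SNS_TOPIC = 'arn:aws:sns:us-east-1:123456789012:scrape-failures'  #placeholder ARN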
Example No. 2
def save_to_s3(page_data, page_number_within_scrape, roster_row, filetype='html'):
    county = roster_row['County']
    state = roster_row['State']
    date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    #Default to .html unless a pdf or xls page was scraped
    extension = filetype if filetype in ('pdf', 'xls') else 'html'
    filename = (state + '/' + county + '/' + str(datetime.now().year) + '/' +
                datetime.now().strftime("%B") + '/' + date_collected +
                '_page_{}.{}'.format(page_number_within_scrape, extension))
    print(filename)
    logger.info('Saved file: _%s_', filename)
    s3.Object(BUCKET,filename).put(Body=page_data)
def main(urlAddress):
    try:
        #urlAddress = roster['Working Link'].values[index]

        req = requests.get(urlAddress)
        pdf_data = req.content

        #Mark the time the file is collected
        date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #Create a file named with the collection timestamp and write the page either to S3
        #or to a local folder under the root directory; state and county identify the
        #location being scraped.

        #Toggle local/s3 storage
        aws = True

        if aws:
            s3.Object(
                'jailcrawl',
                state + '/' + county + '/' + str(datetime.now().year) + '/' +
                datetime.now().strftime("%B") + '/' + date_collected +
                '.pdf').put(Body=pdf_data)
        else:
            if not os.path.exists('./Scrapings/{}/{}/'.format(state, county)):
                os.makedirs('./Scrapings/{}/{}/'.format(state, county))

            with open(
                    root_directory + '/Scrapings/{}/{}/'.format(state, county) +
                    date_collected + '.pdf', 'wb') as file_:
                file_.write(pdf_data)  #the with block closes the file automatically

    except Exception as errorMessage:
        #Post error to firebase server
        date_collected = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        data = {'Message': str(errorMessage)}
        firebase.put('/ErrorLogs/' + locationInUse + '/', date_collected, data)
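For context, a hedged sketch of how save_to_s3() above might be invoked after fetching a roster page; the URL and roster_row values are illustrative only:

#Hypothetical caller for save_to_s3(); values below are placeholders, not from the original project.
import requests

roster_row = {'County': 'Example County', 'State': 'Example State'}
response = requests.get('https://example.com/roster.pdf')
save_to_s3(response.content, 1, roster_row, filetype='pdf')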
Example No. 4
TeamNumber = configdata["team"]

bucket_name = str(TeamNumber) + configdata["state"].lower() + 'assignment1'
print(bucket_name)
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
print(conn)
bucket = conn.create_bucket(bucket_name, location=boto.s3.connection.Location.DEFAULT)
log_entry("S3 bucket has been successfully created.")

filename_base_data=configdata["state"]+"_"+configdata["StationId"]
s3 = boto3.resource('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
file = filename_base_data+".csv"
exists = False

try:
    s3.Object(bucket_name, file).load()
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        exists = False
        print(exists)
    else:
        raise
else:
    exists = True
    print(exists)

if not exists:
    Listlinks = configdata["basedata_links"]
    length = len(Listlinks)
    for counter in range(0, length):
        link = Listlinks[counter]
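The load()/ClientError pattern above is the usual boto3 way to probe for an object without downloading it. As a standalone sketch, it could be wrapped in a hypothetical helper like this (the function name and parameters are mine, not from the original code):

import boto3
import botocore

def s3_object_exists(bucket_name, key):
    #Return True if the object exists, False on a 404, and re-raise any other error
    s3 = boto3.resource('s3')
    try:
        s3.Object(bucket_name, key).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False
        raise
    return True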
Example No. 5
import boto3
import boto.s3
import sys
from boto.s3.key import Key

print("Connecting to S3")

#These keys should be kept secret, but they belong to the ttds-group-user account,
#which is in the ttds-team group with S3-only access, so an attacker
#could not do too much damage with them.
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''

s3 = boto3.resource('s3',
                    aws_access_key_id=AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

test_binary_data = b'This is some text data from S3 :)'

obj = s3.Object('test-bucket-ttdsgroup', 'testfile.txt')
obj.put(Body=test_binary_data)
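#A quick read-back check (not part of the original snippet): fetch the object just
#written and confirm that the contents round-trip unchanged.
retrieved = s3.Object('test-bucket-ttdsgroup', 'testfile.txt').get()['Body'].read()
assert retrieved == test_binary_data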
def main():
    #Retrieve command-line arguments
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)

    #Variable declarations
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data = args.nb_training_data
    iteration_model = args.iteration_model
    min_partition = args.min_partition
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('oc-calculdistribues-sberton')
    result_file = class1 + '_' + class2 + '_' + time.strftime(
        "%Y%m%d%H%M%S") + '.json'
    model_file = 'model_' + class1 + '_' + class2 + '_' + str(
        nb_training_data) + '_' + str(iteration_model)
    model_pathname = os.path.join(main_directory, "models", model_file)

    #Search for an existing model and record its existence in the is_model boolean
    key = 'distributed_learning/models/' + model_file
    objs = list(bucket.objects.filter(Prefix=key))
    is_model = len(objs) > 0 and objs[0].key.startswith(key + '/')

    start_time = time.time()
    lg.info(
        '#################### Starting pet-classification ######################'
    )
    lg.info('Class 1 is %s', class1)
    lg.info('Class 2 is %s', class2)
    lg.info('Number of training data is %s', nb_training_data)
    lg.info('Number of model iterations is %s', iteration_model)

    #Persist a common RDD that is used by both the training and testing data
    common_rdd = sc.textFile(directory_feature, minPartitions=min_partition)\
                   .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
                   .persist()

    #Load the model if it exists
    if is_model and not force_by_user:
        model = SVMModel.load(sc, model_pathname)
        lg.info('Found and loaded recorded model %s', model_file)
    else:
        lg.info('No recorded model found')
        #Create training rdd and train model if no model was found or the user forced retraining
        train_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) <= nb_training_data)\
                                   .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                        if line.split(', ')[0] == class1
                                        else Row(label=1.0, features=line.split(', ')[2:]))\
                                   .map(lambda line: LabeledPoint(line.label, line.features))

        lg.info('%s features for training data', train_data_rdd.count())
        lg.info('Starting to train model')
        model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model)
        lg.info('Model training finished')

    training_time = time.time()
    training_duration = training_time - start_time
    #Create testing rdd
    test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
                      .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                           if line.split(', ')[0] == class1
                                           else Row(label=1.0, features=line.split(', ')[2:]))\
                      .map(lambda row: LabeledPoint(row.label, row.features))
    lg.info('%s features for test data', test_data_rdd.count())

    #Evaluate the model on the test data
    predictions = test_data_rdd.map(
        lambda row: (row.label, float(model.predict(row.features))))
    test_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
                                     / float(predictions.count())
    lg.info('Test Error : %s', str(test_error))
    end_time = time.time()
    duration = end_time - start_time
    lg.info('Duration %s', str(duration))
    prediction_duration = end_time - training_time
    #Save and dump result on S3
    result.append({
        "class1": class1,
        "class2": class2,
        "iteration_model": iteration_model,
        "nb_training_data": nb_training_data,
        "total_duration": duration,
        "train_duration": training_duration,
        "predict_duration": prediction_duration,
        "error": test_error
    })

    s3object = s3.Object('oc-calculdistribues-sberton', result_file)
    s3object.put(Body=json.dumps(result, indent=2).encode('utf-8'))

    #Save the model if it did not already exist
    if not is_model:
        lg.info('Saving model at %s', model_file)
        model.save(sc, model_pathname)

    lg.info(
        '#################### Ending pet-classification ######################'
    )
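#main() above assumes module-level imports and a SparkContext named sc that the snippet
#does not show. A minimal sketch of that setup (the app name here is a placeholder):
import json
import os
import time
import logging as lg

import boto3
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setAppName('pet-classification')
sc = SparkContext(conf=conf)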