Example #1
def handleTask(TASK, DATE):
    # TASK is one GDELT master-file record: [size, md5sum, url].
    print("HANDLING: " + str(TASK))
    ZIP_FILENAME = TASK[2].split('/')[-1]

    # Download the zip archive and verify it against the md5 checksum
    # published in the master file.
    zipResponse = urllib2.urlopen(TASK[2])
    zipContent = zipResponse.read()
    if not checkMd5Sum(zipContent, TASK[1]):
        # reject() (see Example #7) already logs the rejection and writes
        # the payload to REJECT_PATH, so no extra hdfs.write is needed here.
        reject(zipContent, ZIP_FILENAME, DATE)
        return
    else:
        hdfs.log(LOG_PATH, ZIP_FILENAME + ' has correct md5Sum value', False)

    # Persist the archive locally and extract the CSV it contains.
    saveFileAs(zipContent, ZIP_FILENAME)
    print("SAVED: " + ZIP_FILENAME)
    with zipfile.ZipFile(ZIP_FILENAME, 'r') as zip_ref:
        zip_ref.extractall('.')
    CSV_FILENAME = ZIP_FILENAME[0:-4]  # strip the '.zip' extension
    print("UNZIPPED: " + CSV_FILENAME)

    # Upload the CSV to HDFS (normalising the '.CSV' suffix to '.csv'),
    # record a checkpoint, then clean up the local copies.
    HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
    readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
    checkpoint(CHECKPOINT_PATH, HDFS_PATH)
    print("CHECKPOINT: " + HDFS_PATH)
    remove(ZIP_FILENAME)
    remove(CSV_FILENAME)
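
The checkMd5Sum helper is not shown on this page. A minimal sketch of what it might look like, assuming it compares the MD5 hex digest of the downloaded bytes against the checksum column of the master file:

import hashlib

def checkMd5Sum(content, expected_md5):
    # Hypothetical implementation: hash the raw bytes and compare the
    # hex digest (case-insensitively) with the published checksum.
    return hashlib.md5(content).hexdigest() == expected_md5.lower()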
Example #2
def handle_task(task, run_control_date):
    print("HANDLING: " + str(task))

    try:
        # Each task carries an API request URL; its STARTDATETIME and
        # ENDDATETIME query parameters identify the extraction interval.
        url = task[0]
        parsed_url = urlparse.urlparse(url)

        start_datetime = urlparse.parse_qs(
            parsed_url.query)['STARTDATETIME'][0]
        start_time = start_datetime[-6:]
        end_datetime = urlparse.parse_qs(parsed_url.query)['ENDDATETIME'][0]
        end_time = end_datetime[-6:]

        # Fetch the JSON payload and store it in HDFS under a file name
        # derived from the interval's HHMMSS-HHMMSS boundaries.
        response = urllib2.urlopen(url)
        json_content = response.read()

        article_info_json = ARTICLE_INFO_JSON.replace(
            '{INTERVAL}', start_time + "-" + end_time)

        hdfs.write(article_info_json, json_content)
    except Exception as e:
        print(e)
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'Error {0} while working on task {1}'.format(e, task), False)
        return

    # Only reached on success: record the task in the checkpoint file.
    hdfs.append(
        API_EXTRACTION_CHECKPOINT_FILE,
        '{}|{}|{}'.format(url,
                          datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
                          article_info_json))
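
For reference, this is what those urlparse calls extract. The URL below is made up, but it has the query-parameter shape handle_task expects:

import urlparse

url = ('https://api.example.com/doc?query=test'
       '&STARTDATETIME=20190101000000&ENDDATETIME=20190101001500')
query = urlparse.parse_qs(urlparse.urlparse(url).query)
print(query['STARTDATETIME'][0][-6:])  # '000000' -> start_time
print(query['ENDDATETIME'][0][-6:])    # '001500' -> end_time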
Example #3
def getCheckpointsList(path, RUN_CONTROL_DATE):
    EXPORT_FILE_SUFFIX = '.export.csv'
    path = path.replace('{DATE}', str(RUN_CONTROL_DATE))
    if not hdfs.exists(path):
        return []

    # Skip the header line, then collect the timestamp part of every
    # checkpointed file path that belongs to the current run date.
    checkpointFileContent = hdfs.readFileAsString(path)
    checkpointList = []
    for line in checkpointFileContent.split('\n')[1:]:
        if line == '':
            continue

        if not line.endswith(EXPORT_FILE_SUFFIX):
            errorMessage = '"{0}" does not end with the suffix ".export.csv"'.format(line)
            print(line)
            hdfs.log(LOG_PATH, errorMessage, True)
        else:
            # Index-based access assumes a fixed path layout with the run
            # date as the fourth segment and the file name as the sixth.
            splitted_line = line.split('/')
            pathDate = splitted_line[3]
            if pathDate == RUN_CONTROL_DATE:
                fileName = splitted_line[5].split('.')[0]
                checkpointList.append(fileName)
    return checkpointList
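
Since the split works by position, the checkpoint paths must follow a fixed layout. A hypothetical entry (the real layout is not shown on this page) illustrates which segments the function reads:

line = '/user/extraction/20190101/accepted/20190101000000.export.csv'
parts = line.split('/')
print(parts[3])                # '20190101'       -> compared to RUN_CONTROL_DATE
print(parts[5].split('.')[0])  # '20190101000000' -> appended to checkpointList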
Example #4
def enqueueTasks(TASK_LIST, LIST_NAME):
    que = redis.Redis(host=REDIS_URL, port=6379)
    hdfs.log(LOG_PATH, 'Connected to Redis', False)

    # Drop any stale queue left over from a previous run, then push
    # every task onto the list.
    que.delete(LIST_NAME)
    for task in TASK_LIST:
        que.lpush(LIST_NAME, str(task))
        hdfs.log(LOG_PATH,
                 'LeftPushed ' + str(task) + ' into ' + LIST_NAME + ' list',
                 False)

    # redis-py keeps the connection open; kill this client's own
    # connection explicitly so the run leaves nothing behind.
    que.client_kill_filter(_id=que.client_id())
    hdfs.log(LOG_PATH, 'Disconnected from Redis', False)
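
A worker on the consumer side would pop these tasks in FIFO order. A minimal sketch, assuming the same connection settings (REDIS_URL and the list name below are placeholders; redis-py's brpop returns a (list, item) pair, or None on timeout):

import redis

REDIS_URL = 'localhost'  # assumed; must match the producer's host
LIST_NAME = 'tasks'      # hypothetical list name

que = redis.Redis(host=REDIS_URL, port=6379)
while True:
    # brpop blocks until an item arrives; lpush on the producer plus
    # brpop here yields FIFO (oldest task first) ordering.
    item = que.brpop(LIST_NAME, timeout=30)
    if item is None:
        break  # queue drained, stop the worker
    _key, task = item
    print('Consuming: ' + task)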
Example #5
def enqueue_tasks(task_list):
    que = redis.Redis(host=REDIS_URL, port=6379)

    hdfs.log(API_EXTRACTION_LOG_FILE, 'Connected to Redis', False)

    # Unlike enqueueTasks above, the queue is not cleared first, so new
    # tasks are appended to whatever QUE_NAME already holds.
    for task in task_list:
        que.lpush(QUE_NAME, str(task))
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'LeftPushed ' + str(task) + ' into ' + QUE_NAME + ' list',
                 False)

    que.client_kill_filter(_id=que.client_id())

    hdfs.log(API_EXTRACTION_LOG_FILE, 'Disconnected from Redis', False)
Example #6
def getNewTasksList(CONTROL_DATE,
                    CHECKPOINT_PATH,
                    MASTERFILE_URL,
                    end_hour=24):
    time_stamps_to_download = generate_daily_time_slots(CONTROL_DATE, end_hour)

    EXPORT_FILE_SUFFIX_MASTER_FILE = '.export.CSV.zip'

    # Fetch the GDELT master file and diff it against the checkpoints,
    # so that only records not yet processed become tasks.
    checkPointList = getCheckpointsList(CHECKPOINT_PATH, CONTROL_DATE)
    masterFileResponse = urllib2.urlopen(MASTERFILE_URL)
    masterFile = masterFileResponse.read()
    counter = 0
    taskList = []
    for line in masterFile.split('\n'):
        # Each master-file record has three space-separated fields:
        # size, md5 checksum and download URL.
        gdeltRecord = line.split(' ')
        if len(gdeltRecord) != 3:
            hdfs.log(LOG_PATH, 'Invalid record in GDELT MASTER FILE', False)
            continue
        if CONTROL_DATE in gdeltRecord[2] and gdeltRecord[2].endswith(
                EXPORT_FILE_SUFFIX_MASTER_FILE):
            for slot in time_stamps_to_download:
                if slot in gdeltRecord[2]:
                    hdfs.log(
                        LOG_PATH,
                        'Found record with correct date ' + str(gdeltRecord),
                        False)
                    timestamp = gdeltRecord[2].split('/')[-1].split('.')[0]
                    if timestamp in checkPointList:
                        hdfs.log(LOG_PATH,
                                 'Found in checkpoints ' + str(gdeltRecord),
                                 False)
                    else:
                        taskList.append(gdeltRecord)
                        counter += 1
    hdfs.log(LOG_PATH, '#' + str(counter) + ' tasks created.', False)
    return taskList
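
generate_daily_time_slots is not shown either. Since GDELT 2.0 publishes one export file every 15 minutes, a plausible sketch produces a YYYYMMDDHHMMSS slot string for each update up to end_hour:

def generate_daily_time_slots(control_date, end_hour=24):
    # Hypothetical implementation: one slot per 15-minute GDELT update,
    # formatted as <date>HHMMSS to match the file names in the master file.
    slots = []
    for hour in range(end_hour):
        for minute in (0, 15, 30, 45):
            slots.append('{0}{1:02d}{2:02d}00'.format(control_date, hour, minute))
    return slots

For example, generate_daily_time_slots('20190101', end_hour=1) would yield ['20190101000000', '20190101001500', '20190101003000', '20190101004500'].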
Example #7
def reject(zipContent, fileName, DATE):
    # Log the rejection as an error and archive the raw payload under
    # REJECT_PATH so it can be inspected or replayed later.
    print("REJECT: " + fileName)
    hdfs.log(LOG_PATH, 'Reject file "' + fileName + '"', True)
    hdfs.write(REJECT_PATH + fileName, zipContent)
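
All of these examples call into an hdfs helper module that this page does not include. A rough sketch of what such a wrapper could look like, assuming it simply shells out to the hdfs dfs CLI (the real module may well differ):

import subprocess

# Hypothetical stand-in for the project's hdfs module; every call shells
# out to the Hadoop CLI, which is slow but dependency-free.
def exists(path):
    return subprocess.call(['hdfs', 'dfs', '-test', '-e', path]) == 0

def readFileAsString(path):
    return subprocess.check_output(['hdfs', 'dfs', '-cat', path])

def write(path, content):
    # 'hdfs dfs -put -f - <path>' reads the file body from stdin.
    p = subprocess.Popen(['hdfs', 'dfs', '-put', '-f', '-', path],
                         stdin=subprocess.PIPE)
    p.communicate(content)

def append(path, content):
    p = subprocess.Popen(['hdfs', 'dfs', '-appendToFile', '-', path],
                         stdin=subprocess.PIPE)
    p.communicate(content)

def log(log_path, message, is_error):
    prefix = 'ERROR ' if is_error else 'INFO '
    append(log_path, prefix + message + '\n')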