def handleTask(TASK, DATE):
    """Download one GDELT export zip, verify its md5 sum, unzip it and push the CSV to HDFS.

    TASK is one GDELT master-file record: TASK[1] carries the expected md5
    sum and TASK[2] the zip URL.  DATE is the run-control date, forwarded to
    reject() for bookkeeping.  Local files are removed once the CSV has been
    checkpointed.
    """
    print("HANDLING: " + str(TASK))
    ZIP_FILENAME = TASK[2].split('/')[-1]
    zipResponse = urllib2.urlopen(TASK[2])
    zipContent = zipResponse.read()
    if not checkMd5Sum(zipContent, TASK[1]):
        # BUG FIX: reject() already logs the rejection and writes the payload
        # under REJECT_PATH; the original duplicated that hdfs.write here,
        # storing the rejected file twice.
        reject(zipContent, ZIP_FILENAME, DATE)
        return
    hdfs.log(LOG_PATH, ZIP_FILENAME + ' has correct md5Sum value', False)
    saveFileAs(zipContent, ZIP_FILENAME)
    print("SAVED: " + ZIP_FILENAME)
    with zipfile.ZipFile(ZIP_FILENAME, 'r') as zip_ref:
        zip_ref.extractall('.')
    # strip the trailing '.zip' -> e.g. '<timestamp>.export.CSV'
    CSV_FILENAME = ZIP_FILENAME[0:-4]
    print("UNZIPPED: " + CSV_FILENAME)
    # strip the trailing '.CSV' and re-append a lowercase '.csv' for HDFS
    HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
    readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
    checkpoint(CHECKPOINT_PATH, HDFS_PATH)
    print("CHECKPOINT: " + HDFS_PATH)
    # clean up the local working copies
    remove(ZIP_FILENAME)
    remove(CSV_FILENAME)
def handle_task(task, run_control_date):
    """Fetch one GDELT API task and persist the JSON payload to HDFS.

    task is a sequence whose first element is the API URL; the URL's query
    string carries STARTDATETIME / ENDDATETIME stamps whose last six
    characters (the time portion) name the output file.  Any failure is
    printed and logged and the task is dropped; a checkpoint record is
    appended only after a successful write.  run_control_date is accepted
    for interface compatibility and not used here.
    """
    print("HANDLING: " + str(task))
    try:
        url = task[0]
        query = urlparse.parse_qs(urlparse.urlparse(url).query)
        # keep only the trailing HHMMSS of each stamp for the interval label
        interval_start = query['STARTDATETIME'][0][-6:]
        interval_end = query['ENDDATETIME'][0][-6:]
        payload = urllib2.urlopen(url).read()
        target_path = ARTICLE_INFO_JSON.replace(
            '{INTERVAL}', interval_start + "-" + interval_end)
        hdfs.write(target_path, payload)
    except Exception as e:
        # best-effort: report the failure and skip this task
        print(e)
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'Error {0} while working on task {1}'.format(e, task),
                 False)
        return
    hdfs.append(
        API_EXTRACTION_CHECKPOINT_FILE,
        '{}|{}|{}'.format(url,
                          datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
                          target_path))
def getCheckpointsList(path, RUN_CONTROL_DATE):
    """Return the already-processed file names for RUN_CONTROL_DATE.

    path may contain a '{DATE}' placeholder which is substituted with the
    run-control date.  Reads the checkpoint file from HDFS, skips its header
    line, and collects the base file name (no extension) of every
    '.export.csv' entry whose path date matches RUN_CONTROL_DATE.
    Returns [] when no checkpoint file exists yet.
    """
    EXPORT_FILE_SUFFIX = '.export.csv'
    path = path.replace('{DATE}', str(RUN_CONTROL_DATE))
    if not hdfs.exists(path):
        return []
    checkpointFileContent = hdfs.readFileAsString(path)
    checkpointList = []
    # [1:] skips the checkpoint file's header line
    for line in checkpointFileContent.split('\n')[1:]:
        if line == '':
            continue
        if not line.endswith(EXPORT_FILE_SUFFIX):
            # BUG FIX: the original called str.format('"%s" ...', line);
            # str.format ignores %s placeholders, so the offending line was
            # never interpolated, and the raw line was printed instead of
            # the message.
            errorMessage = '"{0}" does not end with the suffix ".export.csv"'.format(line)
            print(errorMessage)
            hdfs.log(LOG_PATH, errorMessage, True)
        else:
            splitted_line = line.split('/')
            pathDate = splitted_line[3]
            # compare as strings: RUN_CONTROL_DATE may arrive as a non-str
            # value (it is stringified for the path substitution above too)
            if pathDate == str(RUN_CONTROL_DATE):
                fileName = splitted_line[5].split('.')[0]
                checkpointList.append(fileName)
    return checkpointList
def enqueueTasks(TASK_LIST, LIST_NAME):
    """Replace the Redis list LIST_NAME with the tasks in TASK_LIST.

    The existing list is deleted first, then every task is left-pushed as
    its string representation.  Every step is mirrored to the HDFS log.
    """
    connection = redis.Redis(host=REDIS_URL, port=6379)
    hdfs.log(LOG_PATH, 'Connected to Redis', False)
    # start from a clean slate so stale tasks are not re-processed
    connection.delete(LIST_NAME)
    for item in TASK_LIST:
        connection.lpush(LIST_NAME, str(item))
        hdfs.log(LOG_PATH,
                 'LeftPushed ' + str(item) + ' into ' + LIST_NAME + ' list',
                 False)
    # disconnect by asking the server to kill this client's own connection
    connection.client_kill_filter(_id=connection.client_id())
    hdfs.log(LOG_PATH, 'Disconnected from Redis', False)
def enqueue_tasks(task_list):
    """Left-push every task in task_list onto the Redis list QUE_NAME.

    Unlike the zip-download variant, the existing queue is NOT cleared
    first — new tasks are appended to whatever is already queued.  Every
    push is mirrored to the HDFS log.
    """
    client = redis.Redis(host=REDIS_URL, port=6379)
    hdfs.log(API_EXTRACTION_LOG_FILE, 'Connected to Redis', False)
    for entry in task_list:
        client.lpush(QUE_NAME, str(entry))
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'LeftPushed ' + str(entry) + ' into ' + QUE_NAME + ' list',
                 False)
    # disconnect by asking the server to kill this client's own connection
    client.client_kill_filter(_id=client.client_id())
    hdfs.log(API_EXTRACTION_LOG_FILE, 'Disconnected from Redis', False)
def getNewTasksList(CONTROL_DATE, CHECKPOINT_PATH, MASTREFILE_URL, end_hour=24):
    """Build the list of GDELT export-zip records still to be downloaded.

    Downloads the GDELT master file, keeps every '.export.CSV.zip' record
    whose URL contains CONTROL_DATE and one of the daily time slots
    generated up to end_hour, and drops records whose timestamp is already
    present in the checkpoint file.  Returns the remaining master-file
    records (each a [size, md5, url] triple).
    """
    time_stamps_to_download = generate_daily_time_slots(CONTROL_DATE, end_hour)
    EXPORT_FILE_SUFFIX_MASTER_FILE = '.export.CSV.zip'
    checkPointList = getCheckpointsList(CHECKPOINT_PATH, CONTROL_DATE)
    masterFile = urllib2.urlopen(MASTREFILE_URL).read()
    counter = 0
    taskList = []
    for line in masterFile.split('\n'):
        gdeltRecord = line.split(' ')
        # a valid master-file record is "<size> <md5> <url>"
        if len(gdeltRecord) != 3:
            hdfs.log(LOG_PATH, 'Invalid record in GDELT MASTER FILE', False)
            continue
        url = gdeltRecord[2]
        if CONTROL_DATE not in url or not url.endswith(EXPORT_FILE_SUFFIX_MASTER_FILE):
            continue
        for slot in time_stamps_to_download:
            if slot not in url:
                continue
            hdfs.log(LOG_PATH,
                     'Found record with correct date ' + str(gdeltRecord),
                     False)
            timestamp = url.split('/')[-1].split('.')[0]
            if timestamp in checkPointList:
                hdfs.log(LOG_PATH,
                         'Found in checkpoints' + str(gdeltRecord),
                         False)
            else:
                taskList.append(gdeltRecord)
                counter = counter + 1
    hdfs.log(LOG_PATH, '#' + str(counter) + ' tasks created.', False)
    return taskList
def reject(zipContent, fileName, DATE):
    """Record a rejected download: announce it, log it, and archive the
    payload under REJECT_PATH for later inspection.

    DATE is accepted for interface compatibility; it is not used here.
    """
    print("REJECT: " + fileName)
    rejection_note = 'Reject file "' + fileName + '"'
    hdfs.log(LOG_PATH, rejection_note, True)
    hdfs.write(REJECT_PATH + fileName, zipContent)