def handle_task(task, run_control_date):
    print("HANDLING: " + str(task))
    try:
        url = task[0]
        # Extract the queried time interval from the API URL parameters.
        parsed_url = urlparse.urlparse(url)
        start_datetime = urlparse.parse_qs(parsed_url.query)['STARTDATETIME'][0]
        start_time = start_datetime[-6:]
        end_datetime = urlparse.parse_qs(parsed_url.query)['ENDDATETIME'][0]
        end_time = end_datetime[-6:]
        # Download the JSON response and store it in HDFS under the interval-specific name.
        response = urllib2.urlopen(url)
        json_content = response.read()
        article_info_json = ARTICLE_INFO_JSON.replace('{INTERVAL}', start_time + "-" + end_time)
        hdfs.write(article_info_json, json_content)
    except Exception as e:
        print(e)
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'Error {0} while working on task {1}'.format(e, task), False)
        return
    # Record the finished download in the checkpoint file.
    hdfs.append(API_EXTRACTION_CHECKPOINT_FILE,
                '{}|{}|{}'.format(url,
                                  datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
                                  article_info_json))
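# handle_task appends one pipe-delimited row per successful download to the API extraction
# checkpoint file, whose header line ('API_URL|FINISH_DATETIME|FILE_LOCATION') is written
# further below. As an illustration only (this helper is an assumption, not part of the
# project's code), the checkpoint could be read back to find URLs that were already fetched:
def read_checkpointed_urls():
    lines = hdfs.readFileAsString(API_EXTRACTION_CHECKPOINT_FILE).split('\n')
    # Skip the header row; the first column of each row is the API URL.
    return set(line.split('|')[0] for line in lines[1:] if line)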
def handleTask(TASK, DATE):
    print("HANDLING: " + str(TASK))
    ZIP_FILENAME = TASK[2].split('/')[-1]
    # Download the zip archive referenced by the task.
    zipResponse = urllib2.urlopen(TASK[2])
    zipContent = zipResponse.read()
    if not checkMd5Sum(zipContent, TASK[1]):
        # Checksum mismatch: store the archive in the reject directory and stop.
        reject(zipContent, ZIP_FILENAME, DATE)
        hdfs.write(REJECT_PATH + ZIP_FILENAME, zipContent)
        return
    else:
        hdfs.log(LOG_PATH, ZIP_FILENAME + ' has correct md5Sum value', False)
    # Save the archive locally and unpack the CSV it contains.
    saveFileAs(zipContent, ZIP_FILENAME)
    print("SAVED: " + ZIP_FILENAME)
    with zipfile.ZipFile(ZIP_FILENAME, 'r') as zip_ref:
        zip_ref.extractall('.')
    CSV_FILENAME = ZIP_FILENAME[0:-4]
    print("UNZIPPED: " + CSV_FILENAME)
    # Upload the CSV to the accept directory, checkpoint it, and clean up local files.
    HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
    readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
    checkpoint(CHECKPOINT_PATH, HDFS_PATH)
    print("CHECKPOINT: " + HDFS_PATH)
    remove(ZIP_FILENAME)
    remove(CSV_FILENAME)
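# handleTask relies on two small helpers that are not shown in this listing. A minimal
# sketch of what they could look like, assuming TASK[1] carries a hex-encoded MD5 digest
# (as published in the GDELT master file list) and that archives are saved to the current
# working directory; these are illustrative assumptions, not the project's exact code.
import hashlib


def checkMd5Sum(content, expectedMd5):
    # Compare the MD5 hex digest of the downloaded bytes with the expected value.
    return hashlib.md5(content).hexdigest() == expectedMd5


def saveFileAs(content, fileName):
    # Write the downloaded bytes to a local file; 'wb' keeps the zip content intact.
    with open(fileName, 'wb') as localFile:
        localFile.write(content)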
def set_up(max_hour):
    print("Setting up {}".format(RUN_CONTROL_DATE_PATH))
    delete_recursively(RUN_CONTROL_DATE_PATH)
    write(RUN_CONTROL_DATE_PATH, Config.RUN_CONTROL_DATE)

    print("Setting up dictionaries {}".format(DICTIONARIES_PATH))
    subprocess.check_output('python2.7 ../../acquisition/get_dictionaries.py', shell=True)

    # download api
    print("Setting up api {} for hour time slot [0, {}]".format(ARTICLE_INFO_JSON, max_hour))
    subprocess.check_output('python2.7 ../../acquisition/generate_tasks_api.py {}'.format(max_hour), shell=True)
    subprocess.check_output('python2.7 ../../acquisition/download_api.py', shell=True)
    for path in listPath(ARTICLE_INFO_JSON):
        print("\t {}".format(path))

    # download csv
    print("Setting up csv {} for hour time slot [0, {}]".format(ARTICLE_CSV, max_hour))
    subprocess.check_output('python2.7 ../../acquisition/generate_tasks.py {}'.format(max_hour), shell=True)
    subprocess.check_output('python2.7 ../../acquisition/download_csv.py', shell=True)
    for path in listPath(ARTICLE_CSV):
        print("\t {}".format(path))

    # csv distinct
    print("Running csv-distinct")
    subprocess.check_output('../run_processing.sh distinct ../csvdistinct/target/csv-distinct-1.0-SNAPSHOT.jar', shell=True)

    # map country
    print("Running country-mapping")
    subprocess.check_output('../run_processing.sh country ../country-mapping/target/country-mapping-1.0-SNAPSHOT.jar', shell=True)
def getStopWords():
    URL = 'https://raw.githubusercontent.com/aneesha/RAKE/master/SmartStoplist.txt'
    path = '/tech/STOPWORDS.txt'
    if hdfs.exists(path):
        return
    response = urllib2.urlopen(URL)
    content = response.read()
    stopWords = content.split('\n')[1:]
    hdfs.write(path, '\n'.join(stopWords))
def getCountries():
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
    cameoContent = cameoResponse.read()
    countries = []
    for line in cameoContent.split('\n')[1:]:
        splitted = line.split(',')
        if len(splitted) == 3:
            countries.append(splitted[0] + '\t' + splitted[2])
    hdfs.write(path, '\n'.join(countries))
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'LeftPushed ' + str(task) + ' into ' + QUE_NAME + ' list', False)
    que.client_kill_filter(_id=que.client_id())
    hdfs.log(API_EXTRACTION_LOG_FILE, 'Disconnected from Redis', False)


# log
if not hdfs.exists(API_EXTRACTION_LOG_DIR):
    hdfs.mkdir(API_EXTRACTION_LOG_DIR)
if not hdfs.exists(API_EXTRACTION_LOG_FILE):
    hdfs.touch(API_EXTRACTION_LOG_FILE)

# checkpoint
if not hdfs.exists(API_EXTRACTION_CHECKPOINT_DIR):
    hdfs.mkdir(API_EXTRACTION_CHECKPOINT_DIR)
if not hdfs.exists(API_EXTRACTION_CHECKPOINT_FILE):
    hdfs.write(API_EXTRACTION_CHECKPOINT_FILE, 'API_URL|FINISH_DATETIME|FILE_LOCATION')

if len(sys.argv) > 1:
    print("Max hour {}".format(sys.argv[1]))
    new_tasks = get_new_tasks_list(sys.argv[1])
else:
    print("Max hour {}".format(24))
    new_tasks = get_new_tasks_list()

enqueue_tasks(new_tasks)
def readAndPutToHdfs(path, hdfs_path):
    # Read the whole local file into memory and write it to HDFS.
    with open(path, 'r') as local_file:
        fileContent = local_file.read()
    hdfs.write(hdfs_path, fileContent)
def reject(zipContent, fileName, DATE):
    print("REJECT: " + fileName)
    hdfs.log(LOG_PATH, 'Reject file "' + fileName + '"', True)
    hdfs.write(REJECT_PATH + fileName, zipContent)
    return taskList


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

# Substitute the run control date into the working paths before touching any of them.
REJECT_PATH = REJECT_PATH.replace('{DATE}', DATE)
ACCEPT_PATH = ACCEPT_PATH.replace('{DATE}', DATE)
LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
CHECKPOINT_PATH = CHECKPOINT_PATH.replace('{DATE}', DATE)

if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)
if not hdfs.exists(CHECKPOINT_PATH):
    hdfs.write(CHECKPOINT_PATH, 'FINISH_DATE|FILE_LOCATION')

# Consume tasks from the Redis queue until it is empty.
que = redis.Redis(host=REDIS_URL, port=6379)
isEmpty = False
while not isEmpty:
    task = que.blpop(QUE_NAME, timeout=1)
    if task is None:
        isEmpty = True
        print("EMPTY QUEUE")
    else:
        handleTask(parseTask(task), DATE)
que.client_kill_filter(_id=que.client_id())
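# parseTask is used above but not shown. redis-py's blpop returns a (queue_name, value)
# tuple, and handleTask expects TASK[1] to be the md5 sum and TASK[2] the zip URL. A
# minimal sketch under the assumption that each queued value is a space-separated
# '<size> <md5> <url>' row from the GDELT master file list (illustrative only):
def parseTask(task):
    return task[1].split(' ')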
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

FILE_NAME = 'CAMEO.{type}.txt'
URL = 'https://www.gdeltproject.org/data/lookups/CAMEO.{type}.txt'
TYPES = ['type', 'knowngroup', 'ethnic', 'religion', 'eventcodes']
hdfsPath = '/data/gdelt/' + str(DATE) + '/cameo/'

# Download each CAMEO lookup table that is not already present in HDFS,
# dropping the header line before writing.
for type in TYPES:
    path = hdfsPath + FILE_NAME.replace('{type}', type)
    if hdfs.exists(path):
        continue
    cameoResponse = urllib2.urlopen(URL.replace('{type}', type))
    tmp = cameoResponse.read().split('\n')[1:]
    cameoContent = '\n'.join(tmp)
    hdfs.write(path, cameoContent)