def setup_directories():
    """Create every directory the application needs.

    Reads all configured directory paths from the app config, normalizes
    '\\' and '/' to the platform separator, and creates any directory that
    does not exist yet.
    """
    config = config_utils.get_app_config()
    # (section, key) pairs for every configured directory we must ensure.
    directory_keys = [
        ("app", "data_directory"),
        ("logging", "log_directory"),
        ("gdelt", "in_process_csv_directory"),
        ("gdelt", "processed_csv_directory"),
        ("app", "json_directory"),
        ("app", "ib_directory"),
        ("app", "xml_directory"),
    ]
    for section, key in directory_keys:
        # BUG FIX: the original called str.replace() and discarded the
        # result (strings are immutable), so separators were never
        # actually normalized before makedirs.
        directory = config[section][key].replace('\\', os.sep).replace('/', os.sep)
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() guard.
        os.makedirs(directory, exist_ok=True)
def _append_optional_list(parent, values, container_tag, item_tag):
    """Append <container_tag> holding one <item_tag> per value; no-op when values is None."""
    if values is not None:
        container = ElementTree.SubElement(parent, container_tag)
        for value in values:
            ElementTree.SubElement(container, item_tag).text = value


def _append_event_node(events, event_object):
    """Build one <event> element under *events* from an event dict."""
    event_node = ElementTree.SubElement(events, 'event')
    ElementTree.SubElement(event_node, 'title').text = event_object["title"]
    # A single space is used instead of an empty <content/> element,
    # matching the original serialized form.
    the_content = event_object["content"] if len(event_object["content"]) > 0 else " "
    ElementTree.SubElement(event_node, 'content').text = the_content
    _append_optional_list(event_node, event_object["content_paragraphs"],
                          'content_paragraphs', 'paragraph')
    _append_optional_list(event_node, event_object["hit_list"],
                          'keywords', 'keyword')
    _append_optional_list(event_node, event_object["probable_event_dates"],
                          'probable_event_dates', 'probable_event_date')
    # Scalar fields copied straight through (all expected to be strings).
    for tag in ('country', 'lat', 'lng', 'source', 'created_datetime'):
        ElementTree.SubElement(event_node, tag).text = event_object[tag]
    categories = ElementTree.SubElement(event_node, 'categories')
    for category_object in event_object["categories"]:
        ElementTree.SubElement(categories, 'category').text = category_object["category"]
    authors = ElementTree.SubElement(event_node, 'authors')
    for author_object in event_object["authors"]:
        ElementTree.SubElement(authors, 'author').text = author_object


def get_xml_tree(list_of_events):
    """Serialize *list_of_events* to a pretty-printed opsdashboard XML file.

    The file is written to the configured xml_directory with a timestamped
    name ending in "_for_ib.xml".

    :param list_of_events: list of event dicts (see _append_event_node for
        the keys each dict must provide).
    """
    logger.info("GENERATING TREE ...")
    config = config_utils.get_app_config()

    root_element = ElementTree.Element('ns2:opsdashboard')
    root_element.set("xmlns:ns2", "http://www.asd.qwe.rt/sdf")

    # Fixed routing envelope expected by the downstream receiver.
    routing_info = ElementTree.SubElement(root_element, 'routinginfo')
    ElementTree.SubElement(routing_info, 'sender').text = 'OA_FS_SENDER'
    ElementTree.SubElement(routing_info, 'recipient').text = 'FN_FS_Receiver_1'
    ElementTree.SubElement(routing_info, 'priority').text = 'normal'
    ElementTree.SubElement(routing_info, 'template').text = 'Events.xsd'
    # checksum = ET.SubElement(root_element, 'checksum')

    created_datetime = datetime.datetime.fromtimestamp(time.time()).strftime(
        '%Y-%m-%d %H:%M:%S')
    ElementTree.SubElement(root_element, 'timestamp').text = created_datetime
    ElementTree.SubElement(root_element, 'count').text = str(len(list_of_events))
    keywords = ElementTree.SubElement(root_element, 'keywords')
    ElementTree.SubElement(keywords, 'keyword').text = '*'

    events = ElementTree.SubElement(root_element, 'events')
    for event_object in list_of_events:
        _append_event_node(events, event_object)

    # File-name timestamp uses a filesystem-safe format.
    file_stamp = datetime.datetime.fromtimestamp(time.time()).strftime(
        '%Y_%m_%d_%H%M%S')
    xml_str = minidom.parseString(
        ElementTree.tostring(root_element)).toprettyxml()
    output_path = (
        config["app"]["xml_directory"] + os.sep + file_stamp + "_for_ib.xml")
    # FIX: removed the redundant f.close() inside the with-block; the
    # context manager already closes the file.
    with open(output_path, "wb") as f:
        f.write(xml_str.encode('utf-8'))
    logger.info("DONE: GENERATING TREE ...")
def get_json(list_of_events):
    """Write events as Elasticsearch bulk-index JSON and mirror it to a CSV.

    For each event dict in *list_of_events* an action line ({"index": ...})
    and a source-document line are appended to a timestamped "_for_es.json"
    file; the finished file is then copied into a "_for_ib.csv" file so the
    payload can be carried across the integration bus.

    :param list_of_events: list of event dicts produced by the crawlers.
    """
    logger.info("Generating Elasticsearch JSON for bulk indexing...")
    config = config_utils.get_app_config()
    index_name = config["elasticsearch"]["events_index_name"]
    index_type = config["elasticsearch"]["events_index_type"]
    created_datetime = datetime.datetime.fromtimestamp(time.time()).strftime(
        '%Y_%m_%d_%H%M%S')
    json_file_path = config["app"][
        "json_directory"] + os.sep + created_datetime + "_for_es.json"
    # csv file to carry the json across the ib
    csv_file_path = config["app"][
        "ib_directory"] + os.sep + created_datetime + "_for_ib.csv"
    logger.info(json_file_path)
    logger.info(csv_file_path)

    es_json_list = list()
    # PERF FIX: open the bulk file once instead of reopening it in append
    # mode for every event; redundant close() calls inside the with-blocks
    # are also removed.
    with open(json_file_path, 'a') as json_file:
        for event_object in list_of_events:
            categories = [c["category"] for c in event_object["categories"]]
            if event_object["hit_list"] is not None:
                categories.extend(event_object["hit_list"])
            # Elasticsearch geo_point array order is [lng, lat].
            lng_lat = [
                float(event_object["lng"]),
                float(event_object["lat"])
            ]
            # each event should hold only 1 location, create multiple events
            # if the same event is held at other places at the same time
            countries_list = [event_object["country"]]
            new_event_object = {
                "created_date_time": event_object["created_datetime"],
                "location": lng_lat,
                "source": event_object["source"],
                "probable_event_dates": event_object["probable_event_dates"],
                "categories": categories,
                "countries": countries_list,
                "title": event_object["title"],
                "content": event_object["content"],
                "content_paragraphs": event_object["content_paragraphs"],
                "authors": event_object["authors"]
            }
            es_json_list.append(new_event_object)
            # Bulk action line followed by the document itself.
            json.dump(
                {
                    "index": {
                        "_index": index_name,
                        "_type": index_type,
                        "_id": generate_id(event_object["title"])
                    }
                }, json_file)
            json_file.write("\n")
            json.dump(new_event_object, json_file)
            # NOTE(review): the blank line between documents ("\n\n") is
            # kept from the original, but the ES _bulk API normally expects
            # exactly one newline per line — confirm with the consumer.
            json_file.write("\n\n")

    # Mirror the JSON payload into the CSV carrier file for the IB.
    with open(csv_file_path, 'a') as csv_file:
        with open(json_file_path, 'r') as the_file:
            csv_file.write(the_file.read())
    logger.info("Completed generating Elasticsearch JSON for bulk indexing")
def main():
    """Delete the configured data and log directory trees (full reset)."""
    config = config_utils.get_app_config()
    data_directory_path = config["app"]["data_directory"]
    logs_directory_path = config["logging"]["log_directory"]
    # ROBUSTNESS FIX: ignore_errors=True so a missing directory (first run,
    # or a repeated reset) no longer raises FileNotFoundError.
    shutil.rmtree(data_directory_path, ignore_errors=True)
    shutil.rmtree(logs_directory_path, ignore_errors=True)
# Stdlib imports.
import zipfile
from datetime import timedelta
from os.path import isfile, join
# Third-party: date extraction, HTTP, HTML parsing, article extraction.
import datefinder
import requests
from bs4 import BeautifulSoup
from goose3 import Goose
# Project-local modules: app entry point, GDELT lookup tables, utilities.
import App
from gdelt_countries_mapping import countries_mapping
from gdelt_events_mapping import event_codes_mapping
from gdelt_headers import headers
from utils import config_utils, events_utils

# Crawl/browser settings pulled once from the application config at import time.
config = config_utils.get_app_config()
browser_headers = config["gdelt"]["browser"]["headers"]
browser_timeout = config["gdelt"]["browser"]["time_out"]
# Filters controlling which GDELT events/countries/keywords are kept.
base_codes_we_want = config["gdelt"]["event_base_codes_we_want"]
event_codes_to_exclude = config["gdelt"]["event_codes_to_exclude"]
countries_we_want = config["gdelt"]["countries_we_want"]
keywords_we_want = config["gdelt"]["keywords_we_want"]
is_delta_crawl = config["gdelt"]["is_delta_crawl"]
max_urls_to_crawl = config["gdelt"]["max_csv_urls_to_crawl"]
month_abbreviations = config["gdelt"]["month_abbreviations"]
months_of_year = config["gdelt"]["months_of_year"]
# Normalize configured path separators to the platform separator.
# NOTE(review): `os` is not imported in this visible block — presumably
# imported earlier in the file; confirm.
in_process_csv_directory = (
    config["gdelt"]["in_process_csv_directory"]).replace('\\', os.sep).replace(
        '/', os.sep)
def run():
    """Fetch the GDACS 24-hour RSS feed and convert each disaster to an event.

    Builds event dicts via events_utils.generate_event, then writes them out
    as Elasticsearch bulk JSON (and optionally XML). A single bad feed item
    is logged and skipped rather than aborting the whole run.
    """
    logger.info("Running GDACS script")
    config = config_utils.get_app_config()
    rss_24h_feed_url = config["gdacs"]["rss_24h_feed_url"]
    try:
        if config["proxy"]["enabled"].lower() == "true":
            proxy_handler = urllib.request.ProxyHandler({
                "http": config["proxy"]["http_ip_port"],
                "https": config["proxy"]["https_ip_port"]
            })
            logger.info("Added proxy handler")
            gdacs_feed = feedparser.parse(rss_24h_feed_url,
                                          handlers=[proxy_handler])
        else:
            gdacs_feed = feedparser.parse(rss_24h_feed_url)
        logger.info('Getting disasters from GDACS ...')
        items = gdacs_feed['entries']
        events_list = list()
        logger.info(
            "Number of disasters(s) from GDACS 24 hours rss feed to process: {}"
            .format(len(items)))
        num_errors = 0
        # TYPO FIX: "fromm" -> "from" in the per-item log messages below.
        for item_number, item in enumerate(items, start=1):
            try:
                logger.info(
                    "Processing #{} disaster from GDACS feed ...".format(
                        item_number))
                # Feed format e.g. "Mon, 26 Nov 2018 01:19:41 GMT" -> "20181126".
                # NOTE(review): event_date is only ever debug-logged, never
                # used downstream — confirm whether it should feed
                # probable_event_date_list instead of "now".
                event_date = item["published"]
                logger.debug(event_date)
                event_date = datetime.datetime.strptime(
                    event_date,
                    '%a, %d %b %Y %H:%M:%S %Z').strftime('%Y%m%d')
                logger.debug(event_date)
                title = item["title"]
                description = item["description"]
                content_paragraph_list = [description]
                lat = item["geo_lat"]
                lng = item["geo_long"]
                country = item["gdacs_country"]
                # Map the GDACS event type to an internal event code;
                # "445" is the fallback for unknown types.
                event_type = item["gdacs_eventtype"]
                if event_type in event_types:
                    event_type = event_types[event_type]
                else:
                    event_type = "445"
                source = item["link"]
                ts = time.time()
                created_datetime = datetime.datetime.fromtimestamp(
                    ts).strftime('%Y-%m-%d %H:%M:%S')
                probable_event_date_list = [
                    datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
                ]
                category_list = [{"category": event_codes_mapping[event_type]}]
                author_list = [
                    "Global Disaster Alerts & Coordination System (GDAC)"
                ]
                event_object = events_utils.generate_event(
                    title, description, content_paragraph_list, source,
                    created_datetime, probable_event_date_list, country, lat,
                    lng, category_list, author_list)
                logger.info(
                    "Completed processing #{} disaster from GDACS feed".
                    format(item_number))
                events_list.append(event_object)
                logger.info('Currently {} event(s) built'.format(
                    len(events_list)))
            except Exception:
                logger.exception(
                    "Failed to process #{} disaster from GDACS feed ...".
                    format(item_number))
                num_errors += 1
                continue
        events_utils.get_json(events_list)
        # NOTE(review): this GDACS script reads the "gdelt" config section
        # for generate_xml_files — possibly should be "gdacs"; confirm
        # before changing.
        if config["gdelt"]["generate_xml_files"]:
            events_utils.get_xml_tree(events_list)
        logger.info('\n\n#### Summary of GDACS events ###')
        logger.info('Number of disasters from GDACS = {}'.format(len(items)))
        logger.info('Number of events generated from GDACS = {}'.format(
            len(events_list)))
        logger.info('Number of erroneous disasters= {}\n'.format(num_errors))
    except Exception:
        logger.exception("Failed to capture RSS feed from GDACS")