def process_zip_file():
    csv = DATA_DIR + '20200215224500.gkg.csv'
    logging.info("Reading " + csv)
    df = pd.read_csv(csv,
                     header=0,
                     sep='\t',
                     names=GKG_COLUMN_NAMES,
                     encoding='unicode_escape')
    logging.info("gkg df shape " + str(df.shape))

    # append relevant gkg lines to army df
    # Grab rows that contain 'army' in the organizations columns
    # fdf = df[(df['V1ORGANIZATIONS'].str.contains(ARMY_REGEX, case=False) == True) |
    #          (df['V2ENHANCEDORGANIZATIONS'].str.contains(
    #              ARMY_REGEX, case=False) == True)]
    # logging.info("fdf shape " + str(fdf.shape))
    # print(str(fdf.head()))

    # update the processed files list
    # with open(FILES_PROCESSED_LIST, "a") as f:
    #     f.write(zip_file_url + "\n")

    # delete the zip file
    # if os.path.exists(zip_file):
    #     logging.info("removing " + zip_file)
    #     os.remove(zip_file)
    # else:
    #     logging.info(zip_file + " missing.")

    print(df.head())
def load_files_df(df_file):
    try:
        logging.info("Reading " + df_file)
        df = pd.read_table(df_file,
                           sep=' ',
                           usecols=[0, 1, 2],
                           names=['id', 'checksum', 'url'],
                           header=None)
    except Exception as e:
        logging.critical("Not parsed: " + df_file + "\n" + str(e))
        sys.exit()
    return df
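# Note on the input format (an assumption based on the parse above): the GDELT
# master file list is space-separated, one file per row, roughly
#   <size> <md5> <url>
# e.g. (illustrative row, values hypothetical):
#   150383 2f0ea3c18dd5f3... http://data.gdeltproject.org/gdeltv2/20200215224500.gkg.csv.zip
# which is why usecols=[0, 1, 2] maps onto the id/checksum/url names.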
def download(url):
    logging.info("Downloading " + url)
    file_name = fname_from_url(url)
    # open in binary mode
    with open(file_name, "wb") as file:
        # get request
        response = get(url)
        # write to file
        file.write(response.content)
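# `fname_from_url` is used throughout but not defined in this section. A
# minimal sketch, assuming the target file name is simply the last path
# segment of the URL:
def fname_from_url(url):
    """Return the trailing path segment of a URL, e.g.
    '.../gdeltv2/20200215224500.gkg.csv.zip' -> '20200215224500.gkg.csv.zip'."""
    return url.rsplit('/', 1)[-1]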
def main():
    NUM_TABLES = 1
    TEMPLATE_FILE = HTML_DIR + "gchart_template.html"
    HISTOGRAM_FILE1 = HISTOGRAM_DIR + "LocationsHistogram.csv"
    TITLE = "LOCATIONS"
    OUTFILE = HTML_DIR + "locations.html"
    TABLE1 = "V1LOCATIONS counts"

    # Read histogram into list
    logging.info("reading " + HISTOGRAM_FILE1)
    with open(HISTOGRAM_FILE1) as f:
        gcam_lines = f.read().splitlines()

    # build histo dict
    hist_dict = {}
    for line in gcam_lines:
        entries = line.split("\t")
        location = entries[0].strip()
        score = int(entries[1].strip())
        hist_dict[location] = score

    # build the datatable
    dt = '["Feature", "Score"],\n'
    for key, value in hist_dict.items():
        dt += f'["{key}", {value}],\n'
    print(dt)

    # build the options block (title, width, height, bar)
    title_option = f'title: "{TABLE1}",\n'
    width_option = 'width: 600,\n'
    height_option = 'height: 100000,\n'
    bar_option = 'bar: 150,'
    options = title_option + width_option + height_option + bar_option
    print(options)

    # read the template
    with open(TEMPLATE_FILE) as f:
        html = f.read()
    html = html.replace("//%DATA_TABLE", dt, 1)
    html = html.replace("//%OPTIONS", options, 1)

    logging.info("writing " + OUTFILE)
    with open(OUTFILE, 'w') as f:
        f.write(html)
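# An assumption about gchart_template.html (the template itself is not shown):
# the two replace() calls above expect literal placeholder comments inside the
# Google Charts boilerplate, roughly:
#   var data = google.visualization.arrayToDataTable([
#     //%DATA_TABLE
#   ]);
#   var options = {
#     //%OPTIONS
#   };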
def process_zip_file(zip_file_url):
    zip_file = fname_from_url(zip_file_url)
    try:
        logging.info("Reading " + zip_file)
        # requires unicode escape for some files
        df = pd.read_csv(zip_file,
                         compression='zip',
                         header=0,
                         sep='\t',
                         names=GKG_COLUMN_NAMES,
                         encoding='unicode_escape')
        logging.info("gkg df shape " + str(df.shape))

        # append relevant gkg lines to army df
        # Grab rows that contain 'army' in the organizations columns
        # (== True also filters out the NaN results from str.contains)
        fdf = df[(df['V1ORGANIZATIONS'].str.contains(ARMY_REGEX, case=False) == True) |
                 (df['V2ENHANCEDORGANIZATIONS'].str.contains(
                     ARMY_REGEX, case=False) == True)]
        logging.info("fdf shape " + str(fdf.shape))
        # print(str(fdf.head()))

        # update the processed files list
        with open(FILES_PROCESSED_LIST, "a") as f:
            f.write(zip_file_url + "\n")

        # delete the zip file
        if os.path.exists(zip_file):
            logging.info("removing " + zip_file)
            os.remove(zip_file)
        else:
            logging.info(zip_file + " missing.")
    except Exception as e:
        logging.error("Problem reading " + zip_file + ": " + str(e))
        raise  # fdf is undefined here; let the caller record the failure
    return fdf
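# ARMY_REGEX is defined elsewhere and not shown here. A plausible value (an
# assumption, not the author's actual pattern), given the intent of matching
# 'army' organizations with case=False handling capitalization:
# ARMY_REGEX = r"\barmy\b"  # hypothetical; word-boundary match on 'army'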
def build_gkg_urls(master_list):
    logging.info("Loading GDELT Master File List")
    gdelt_list_df = load_files_df(master_list)
    logging.info("gdelt_list_df " + str(gdelt_list_df.shape))

    regex = '202002.{9}gkg'
    #regex = '2020021001.{5}gkg'
    logging.info("filtering on " + regex)

    # Grab rows that contain 'gkg' in the url column
    gdelt_list_df = gdelt_list_df[gdelt_list_df['url'].str.contains(
        regex) == True]

    # Save the url column only
    gdelt_list_df = gdelt_list_df['url']
    logging.info("filtered list length: " + str(gdelt_list_df.shape[0]))

    # write the gkg file
    with open(GKG_FILE_LIST, 'w') as f:
        logging.info("writing " + GKG_FILE_LIST)
        gdelt_list_df.to_csv(f, index=False, sep="\t", header=False)
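# Worked example of the filter above: '202002.{9}gkg' matches the February
# 2020 GKG urls, e.g. in ".../gdeltv2/20200215224500.gkg.csv.zip" the literal
# "202002" is followed by the 9 characters "15224500." and then "gkg".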
def main():
    file_list = glob.glob(DATA_DIR + "army_gkg*.csv")

    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)
    for f in file_list:
        print(f)
        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES)
        tdf['V2.1DATE'] = tdf['V2.1DATE'].astype(str)
        # append temp df to full df
        df = df.append(tdf, ignore_index=True)

    # create a datetime column on the full df
    df['ymd'] = df.apply(lambda row: make_date(row), axis=1)
    df['Datetime'] = pd.to_datetime(df['ymd'], format='%Y-%m-%d')
    #df = df.set_index(pd.DatetimeIndex(df['Datetime']), drop=True)
    df = df.drop(['ymd'], axis=1)
    #print(df.head(500))

    # group full df by day
    daygroups = df.groupby(['Datetime'])
    logging.info("Groups: " + str(daygroups.describe()))

    # for each group write the output file (do not write the date or the index)
    for name, group in daygroups:
        tname = str(name)
        out_fname = "Army_GKG_by_day_" + tname[:10] + ".csv"
        logging.info("Writing group to " + out_fname)
        group.to_csv(out_fname, index=False, header=False, sep="\t")
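# `make_date` is not shown in this section. A minimal sketch, assuming the
# V2.1DATE column holds the GKG 14-digit YYYYMMDDHHMMSS timestamp (e.g.
# '20200215224500') and the goal is the 'YYYY-MM-DD' string that
# pd.to_datetime(..., format='%Y-%m-%d') parses above:
def make_date(row):
    ts = str(row['V2.1DATE'])
    return ts[0:4] + "-" + ts[4:6] + "-" + ts[6:8]  # -> '2020-02-15'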
def main(): ans = "" while (ans != 'y') and (ans != 'n'): ans = input("Download new GDELT Master File List? [y/n] ") ans = ans.lower().strip() master_url = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt" file_name = DATA_DIR + fname_from_url(master_url) if ans == 'y': logging.info("Downloading master file list.") with open(file_name, "wb") as file: # get request response = get(master_url) # write to file file.write(response.content) else: logging.info("Using existing master file list.") build_gkg_urls(file_name)
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # build a list of the column items_list = df['V2ENHANCEDPERSONS'].tolist() #print(f"themes list length: " + str(len(themes_list))) #print(themes_list[10]) items_dict = {} for line in items_list: # line has the content from the GKG cell l = line.strip() items = l.split(";") # list of top level entities for i in items: i = i.strip() if len(i) > 1: ii = i.split(',')[0] # subfield of entity ii = ii.strip() if len(ii) > 1: if ii in items_dict: items_dict[ii] += 1 else: items_dict[ii] = 1 # add the V2 Themes #print(str(themes_dict)) items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index') items_hist_df = items_hist_df.sort_values(by=0, ascending=False) #print(str(themes_hist_df.head(500))) outfile = ARMY_GKG_DAILY_DIR + "PersonsV2Histogram.csv" logging.info("writing " + outfile) items_hist_df.to_csv(outfile, header=False, sep="\t")
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # build a list of the V1THEMES column themes_list = df['V1THEMES'].tolist() #print(f"themes list length: " + str(len(themes_list))) #print(themes_list[10]) themes_dict = {} for line in themes_list: l = line.strip() themes = l.split(";") for t in themes: t = t.strip() if len(t) > 1: tn = t.split(',')[0] if tn in themes_dict: themes_dict[tn] += 1 else: themes_dict[tn] = 1 # add the V2 Themes #print(str(themes_dict)) themes_hist_df = pd.DataFrame.from_dict(themes_dict, orient='index') themes_hist_df = themes_hist_df.sort_values(by=0, ascending=False) #print(str(themes_hist_df.head(500))) outfile = ARMY_GKG_DAILY_DIR + "ThemesHistogram.csv" logging.info("writing " + outfile) themes_hist_df.to_csv(outfile, header=False, sep="\t")
def build_url_queue(use_failed_files):
    if use_failed_files:
        input_file_list = FAILED_FILES_LIST
    else:
        input_file_list = GKG_FILE_LIST

    # Read the gkg file list
    gkg_list = []
    try:
        with open(input_file_list) as f:
            gkg_list = f.read().splitlines()
    except EnvironmentError:
        logging.info(input_file_list + " not read")

    # Read the processed files list if it exists
    logging.info("reading processed files list")
    processed_files = []
    try:
        with open(FILES_PROCESSED_LIST) as f:
            processed_files = f.read().splitlines()
    except EnvironmentError:
        logging.info("processed files list not read")
    logging.info("processed files length: " + str(len(processed_files)))

    # Build the queue by comparing the processed files list with the gkg list
    files_queue = []
    for url in gkg_list:
        if url not in processed_files:
            files_queue.append(url)

    # delete the failed files list
    if os.path.exists(FAILED_FILES_LIST):
        os.remove(FAILED_FILES_LIST)

    return files_queue
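# Design note: `url not in processed_files` is a linear scan per url, so queue
# building is O(len(gkg_list) * len(processed_files)). A set makes each lookup
# O(1); a minimal equivalent of the loop above:
#   processed = set(processed_files)
#   files_queue = [url for url in gkg_list if url not in processed]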
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # build a list of the V1PERSONS column item_list = df['V1PERSONS'].tolist() items_dict = {} for line in item_list: l = line.strip() items = l.split(";") for i in items: i = i.strip() if len(i) > 1: if i in items_dict: items_dict[i] += 1 else: items_dict[i] = 1 #print(str(themes_dict)) items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index') items_hist_df = items_hist_df.sort_values(by=0, ascending=False) #print(str(themes_hist_df.head(500))) outfile = ARMY_GKG_DAILY_DIR + "PersonsHistogram.csv" logging.info("writing " + outfile) items_hist_df.to_csv(outfile, header=False, sep="\t")
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # -------------------------------- DEBUG ------------------------------ # persons_rows = [] # persons_rows.append("n1; n2; n3; n4") # persons_rows.append("n1; n3; n5; n6") gkg_data = [ [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n1; n2; n3; n4', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n1; n3; n5; n6', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], ] # create blank full df #df = pd.DataFrame(gkg_data, columns=GKG_COLUMN_NAMES) # ------ create person nodes (TODO this can be made faster) -------- persons_rows = df['V1PERSONS'].tolist() persons_dict = {} # assign person IDs pid = 0 persons_set = set() for line in persons_rows: persons_set = get_persons_set(line) for person in persons_set: if person in persons_dict.keys(): values = persons_dict[person] node_size = values[0] person_id = values[1] persons_dict[person] = [node_size + 1, person_id] else: persons_dict[person] = [1, pid] pid += 1 # convert, sort and write person nodes file plist = [] for key, value in persons_dict.items(): pl = [value[1], key, value[0]] # id, label, nodesize plist.append(pl) node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value']) node_list_df = node_list_df.sort_values(by=['value'], ascending=False) #print(str(node_list_df.head(20))) nodesfile = GRAPH_DIR + "PersonsNodes.csv" logging.info("writing " + nodesfile) node_list_df.to_csv(nodesfile, header=True, index=False, sep=",") # -------------------- Build edge list ------------------------------------ # make persons column into set df['V1PERSONS'] = df.apply(lambda row: make_persons_pairs(row.V1PERSONS), axis=1) # pdf = df[['V1PERSONS']].copy() # print(pdf.head()) logging.info('creating pairs') pairs = [] persons_list = df['V1PERSONS'].tolist() for row in persons_list: for pair in row: pair_string = str(pair[0]) + ", " + str(pair[1]) pairs.append(pair_string) # make one column with all the pairs and value counts logging.info('building edge df') pdf = pd.DataFrame() pdf["values"] = pairs vc = pdf["values"].value_counts() edge_df = pd.DataFrame(vc) #print("edge df: ") #print(edge_df.head(20)) #print(str(df.head())) #print(str(df['V1PERSONS'])) edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv" logging.info("writing " + edgefile) edge_df.to_csv(edgefile, header=False, index=True, sep=",") # Write the edge list using node ids instead of names edge_df['id1'] = edge_df.apply( lambda row: names_to_ids(persons_dict, row, 0), axis=1) edge_df['id2'] = edge_df.apply( lambda row: names_to_ids(persons_dict, row, 1), axis=1) print(edge_df) edgeidfile = GRAPH_DIR + "PersonsEdgeIDs.csv" logging.info("writing " + edgeidfile) edge_df.to_csv(edgeidfile, columns=['id1', 'id2', 'values'], header=True, index=False, sep=",")
def main(): TITLE = "GKG Persons" entity_name = "persons" search_subtitle = "Filtered on: Organization = US Army (5464 entries)" date_subtitle = "1-FEB-2020 through 16-FEB-2020" NUM_TABLES = 2 HISTOGRAM_FILE1 = HISTOGRAM_DIR + "PersonsHistogram.csv" TABLE1_TITLE = "V1PERSONS occurrences" HISTOGRAM_FILE2 = HISTOGRAM_DIR + "PersonsV2Histogram.csv" TABLE2_TITLE = "V2ENHANCEDPERSONS occurrences" OUTFILE = HTML_DIR + "persons.html" MAX_BAR_LENGTH = 200 # HTML Start html = write_header(TITLE) html.append(f'<h1> {TITLE}</h1>') html.append(f'<h2> {date_subtitle}</h2>') html.append(f'<h2> {search_subtitle}</h2>') html.append(' <div class="flexrow">') # TABLE 1 # Read histogram into list logging.info("reading " + HISTOGRAM_FILE1) with open(HISTOGRAM_FILE1) as f: lines = f.read().splitlines() # get the highest score pair = lines[0].split("\t") high_score = int(pair[1].strip()) # build histo1 dict hist_dict = {} for line in lines: entries = line.split("\t") location = entries[0].strip() score = int(entries[1].strip()) hist_dict[location] = score html.append(' <div class="flexcol"> <table>') t1caption = TABLE1_TITLE + f" ({len(hist_dict)} {entity_name})" html.append(f' <caption>{t1caption}</caption>') # one table row for each dictionary entry for key, value in hist_dict.items(): barlength = (value / high_score) * MAX_BAR_LENGTH barlength = math.ceil(barlength) left = barlength + 5 html.append(' <tr>') html.append(' <td>') html.append(f' <div class="feature">{key}</div>') html.append(' </td>') html.append(' <td>') html.append( f' <div class="score-bar" style="width:{barlength}px;">') html.append( f' <p class="score" style="left: {left}px;">{value}</p></div> ' ) html.append(' </td>') html.append(' </tr>') html.append(' </table> </div>') # TABLE 2 # Read histogram into list logging.info("reading " + HISTOGRAM_FILE2) with open(HISTOGRAM_FILE2) as f: lines = f.read().splitlines() # get the highest score pair = lines[0].split("\t") high_score = int(pair[1].strip()) # build histo dict hist_dict = {} for line in lines: entries = line.split("\t") location = entries[0].strip() score = int(entries[1].strip()) hist_dict[location] = score t2caption = TABLE2_TITLE + f" ({len(hist_dict)} {entity_name})" html.append(' <div class="flexcol"> <table>') html.append(f' <caption>{t2caption}</caption>') # one table row for each dictionary entry for key, value in hist_dict.items(): barlength = (value / high_score) * MAX_BAR_LENGTH barlength = math.ceil(barlength) left = barlength + 5 html.append(' <tr>') html.append(' <td>') html.append(f' <div class="feature">{key}</div>') html.append(' </td>') html.append(' <td>') html.append( f' <div class="score-bar" style="width:{barlength}px;">') html.append( f' <p class="score" style="left: {left}px;">{value}</p></div> ' ) html.append(' </td>') html.append(' </tr>') html.append(' </table> </div>') html.append(' </div>') # f,exrow # CLOSING HTML footer = write_footer() for line in footer: html.append(line) logging.info("writing " + OUTFILE) with open(OUTFILE, 'w') as f: for line in html: f.writelines(line + "\n")
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) #-------------------------------- DEBUG ------------------------------ persons_rows = [] persons_rows.append("n1; n2; n3; n4") persons_rows.append("n1; n3; n5; n6") gkg_data = [ [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n1; n2; n3; n4', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n1; n3; n5; n6', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n6; n7', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n7; n8', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n8; n9', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], [ 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'n3; n10', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27' ], ] # create blank full df #df = pd.DataFrame(gkg_data, columns=GKG_COLUMN_NAMES) # ------ create person nodes (TODO this can be made faster) -------- persons_rows = df['V1PERSONS'].tolist() persons_dict = {} # assign person IDs pid = 0 persons_set = set() for line in persons_rows: persons_set = get_persons_set(line) for person in persons_set: if person in persons_dict.keys(): values = persons_dict[person] node_size = values[0] person_id = values[1] persons_dict[person] = [node_size + 1, person_id] else: persons_dict[person] = [1, pid] pid += 1 # convert, sort and write person nodes file plist = [] for key, value in persons_dict.items(): pl = [value[1], key, value[0]] # id, label, nodesize plist.append(pl) node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value']) node_list_df = node_list_df.sort_values(by=['value'], ascending=False) #print(str(node_list_df.head(20))) nodesfile = GRAPH_DIR + "PersonsNodes.csv" logging.info("writing " + nodesfile) node_list_df.to_csv(nodesfile, header=True, index=False, sep=",") # -------------------- Build edge list ------------------------------------ # make persons column into set df['V1PERSONS'] = df.apply(lambda row: make_persons_pairs(row.V1PERSONS), axis=1) # pdf = df[['V1PERSONS']].copy() # print(pdf.head()) logging.info('creating pairs') pairs = [] persons_list = df['V1PERSONS'].tolist() for row in persons_list: for pair in row: pair_string = 
str(pair[0]) + ", " + str(pair[1]) pairs.append(pair_string) # make one column with all the pairs and value counts logging.info('building edge df') pdf = pd.DataFrame() pdf["values"] = pairs vc = pdf["values"].value_counts() edge_df = pd.DataFrame(vc) #print("edge df: ") #print(edge_df.head(20)) #print(str(df.head())) #print(str(df['V1PERSONS'])) edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv" logging.info("writing " + edgefile) edge_df = edge_df.sort_values(by=['values'], ascending=False) edge_df.to_csv(edgefile, header=False, index=True, sep=",") # Add node ids as columns edge_df['id1'] = edge_df.apply( lambda row: names_to_ids(persons_dict, row, 0), axis=1) edge_df['id2'] = edge_df.apply( lambda row: names_to_ids(persons_dict, row, 1), axis=1) #print(edge_df) edgeidfile = GRAPH_DIR + "PersonsEdgeIDs.csv" logging.info("writing " + edgeidfile) # write ids edge list edge_df = edge_df.sort_values(by=['values'], ascending=False) edge_df.to_csv(edgeidfile, columns=['id1', 'id2', 'values'], header=True, index=False, sep=",") # ----------- Build graphs for top-N node values (number of appearances in doc set) logging.info("Building Top-N Lists") TOP_N = 10 # LINK_VALUE_CUTOFF = 50 shortlist_df = node_list_df[:TOP_N] # keep edge df rows where one of the ids is in the short list short_ids = shortlist_df['id'].apply(str).tolist() #print(str(short_ids)) edges_short = edge_df.loc[(edge_df['id1'].isin(short_ids)) | (edge_df['id2'].isin(short_ids))] edges_short = edges_short[edges_short['values'] > LINK_VALUE_CUTOFF] # write the shortlist nodesfile = GRAPH_DIR + "PersonsEdgeListTopN.csv" logging.info("writing " + nodesfile) edges_short = edges_short.sort_values(by=['values'], ascending=False) edges_short.to_csv(nodesfile, header=True, index=False, sep=",", columns=['id1', 'id2', 'values']) # add the connected nodes to the persons shortlist # collect new ids logging.info("adding adjacent nodes to top N") has_new_node = edges_short.loc[~edges_short['id1'].isin(short_ids) | ~edges_short['id2'].isin(short_ids)] # keep high-link-strength edges only has_new_node = has_new_node[has_new_node['values'] > LINK_VALUE_CUTOFF] #print(has_new_node) ids_set = set(has_new_node['id1'].tolist() + has_new_node['id2'].tolist()) #print(ids_set) #print(node_list_df) logging.info("building adjacent nodes df") plist = [] idx = 0 for nodeid in ids_set: idx += 1 if nodeid in shortlist_df['id'].apply(str).tolist(): pass else: label = node_list_df[node_list_df['id'].apply(str) == nodeid]['label'].item() value = node_list_df[node_list_df['id'].apply(str) == nodeid]['value'].item() pl = [nodeid, label, value] plist.append(pl) if (idx % 100) == 0: print("processed rows: " + str(idx)) adjacent_nodes = pd.DataFrame(plist, columns=['id', 'label', 'value']) #print(adjacent_nodes) shortlist_df = shortlist_df.append(adjacent_nodes) # write the short nodes list to file nodesfile = GRAPH_DIR + "PersonsNodesTopN.csv" logging.info("writing " + nodesfile) shortlist_df = shortlist_df.sort_values(by=['value'], ascending=False) shortlist_df.to_csv(nodesfile, header=True, index=False, sep=",", columns=['id', 'label', 'value'])
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # build a list of the column items_list = df['V2EXTRASXML'].tolist() #print(f"themes list length: " + str(len(themes_list))) #print(themes_list[10]) # stop_words with open(STOPWORDS_FILE) as f: stopwords = f.read().splitlines() punctdict = { ".": "", ",": "", "?": "", "#": "", "$": "", "!": "", "&": "", "*": "", "(": "", ")": "", '"': "", ":": "", ";": "" } items_dict = {} tag = "<PAGE_TITLE>" endtag = "</PAGE_TITLE>" table = str.maketrans(dict.fromkeys( string.punctuation)) # for punctuation removal for line in items_list: # line has the content from the GKG cell l = line.strip() if tag in l: title = line[l.find(tag) + 12:l.find(endtag)] #print(title) words = title.split(" ") for w in words: w = w.strip() w = w.translate(table) # remove punctuation if (len(w) > 1) and (w.lower() not in stopwords): if w in items_dict: items_dict[w] += 1 else: items_dict[w] = 1 #print(str(themes_dict)) items_hist_df = pd.DataFrame.from_dict(items_dict, orient='index') items_hist_df = items_hist_df.sort_values(by=0, ascending=False) #print(str(themes_hist_df.head(500))) outfile = ARMY_GKG_DAILY_DIR + "TitleWordsHistogram.csv" logging.info("writing " + outfile) items_hist_df.to_csv(outfile, header=False, sep="\t")
def main():
    USE_FAILED_FILES_LIST = False

    startTime = pd.Timestamp('now')
    logging.info("ANTS run started at " + str(startTime))

    # elapsed time working on the current output file
    fileTime = pd.Timestamp('now')

    # add time to output file name to prevent overwrites
    timestr = time.strftime("%Y%m%d-%H%M%S")
    outfile = OUTPUT_FILE_PRE + timestr + ".csv"

    # Build the input queue
    files_queue = build_url_queue(USE_FAILED_FILES_LIST)
    if len(files_queue) > 0:
        logging.info(str(len(files_queue)) + " files in queue")
    else:
        logging.info("NO UNPROCESSED FILES IN QUEUE")

    files_processed = 0
    army_gkg_events = 0
    skipped_files = 0
    for zip_file_url in files_queue:
        # Download a file from the queue
        try:
            # UNCOMMENT
            download(zip_file_url)
            fdf = process_zip_file(zip_file_url)
            army_gkg_events += fdf.shape[0]

            # create a new output file after an hour
            elapsed_time = pd.Timestamp('now') - fileTime
            if elapsed_time.seconds > 3600:
                timestr = time.strftime("%Y%m%d-%H%M%S")
                outfile = OUTPUT_FILE_PRE + timestr + ".csv"
                fileTime = pd.Timestamp('now')

            fdf.to_csv(outfile, mode='a', header=False, sep='\t',
                       na_rep=' ', index=False)
            logging.info("Wrote results to " + outfile)
            files_processed += 1
            logging.info(f'Completed {files_processed} files')
            logging.info("ARMY GKG EVENTS SO FAR: " + str(army_gkg_events))
        except Exception as e:
            logging.error("Problem processing " + zip_file_url + ": " + str(e))
            skipped_files += 1
            with open(FAILED_FILES_LIST, "a") as failedfile:
                failedfile.write(zip_file_url + "\n")
            # delete the zip file
            badfile = fname_from_url(zip_file_url)
            if os.path.exists(badfile):
                os.remove(badfile)

    endTime = pd.Timestamp('now')
    logging.info("ANTS run finished at " + str(endTime))
    logging.info("Elapsed time: " + str(endTime - startTime))
    logging.info(f"Processed {files_processed} files.")
    logging.info(f"Skipped {skipped_files} files.")
    logging.info(f"Found {army_gkg_events} relevant gkg events")
def main():
    # create blank full df
    df = pd.DataFrame(columns=GKG_COLUMN_NAMES)
    #file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv")
    file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day_2020-02-16.csv")
    for f in file_list:
        logging.info("reading " + f)
        # read the file into temp df
        tdf = pd.read_csv(f, header=0, sep='\t',
                          names=GKG_COLUMN_NAMES, index_col=False)
        # append temp df to full df
        df = df.append(tdf, ignore_index=True)
    logging.info("consolidated df shape: " + str(df.shape))

    # build a list of the V1PERSONS column
    persons_rows = df['V1PERSONS'].tolist()
    # DEBUG
    # persons_rows = []
    # persons_rows.append("n1; n2; n3; n4")
    # persons_rows.append("n1; n3; n5; n6")
    # persons_rows.append("n1; n3;")

    persons_dict = {}
    # assign person IDs
    pid = 0
    persons_set = set()
    for line in persons_rows:
        persons_set = get_persons_set(line)
        for person in persons_set:
            if person in persons_dict.keys():
                values = persons_dict[person]
                node_size = values[0]
                person_id = values[1]
                persons_dict[person] = [node_size + 1, person_id]
            else:
                persons_dict[person] = [1, pid]
                pid += 1

    # convert, sort and write person nodes file
    plist = []
    for key, value in persons_dict.items():
        pl = [value[1], key, value[0]]
        plist.append(pl)
    node_list_df = pd.DataFrame(plist, columns=['id', 'label', 'value'])
    node_list_df = node_list_df.sort_values(by=['value'], ascending=False)
    print(str(node_list_df.head(20)))
    nodesfile = GRAPH_DIR + "PersonsNodes.csv"
    logging.info("writing " + nodesfile)
    node_list_df.to_csv(nodesfile, header=True, index=False, sep=";")

    # build edge list
    logging.info("building edge list")
    big_edge_list = []
    rows_processed = 0
    for line in persons_rows:
        doc_edge_list = make_doc_edge_list(persons_dict, line)
        #write_edge_list(doc_edge_list)
        # SLOW?
        big_edge_list = update_edge_list(doc_edge_list, big_edge_list)
        rows_processed += 1
        if (rows_processed % 10) == 0:
            logging.info("rows processed: " + str(rows_processed))

    edge_list_df = pd.DataFrame(big_edge_list,
                                columns=['from', 'to', 'strength'])
    edge_list_df = edge_list_df.sort_values(by=['strength'], ascending=False)
    print(str(edge_list_df.head(20)))
    edgefile = GRAPH_DIR + "PersonsEdgeListSorted.csv"
    logging.info("writing " + edgefile)
    edge_list_df.to_csv(edgefile, header=True, index=False, sep=";")
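# Neither `make_doc_edge_list` nor `update_edge_list` is shown in this
# section. Minimal sketches, assuming an edge is a [from_id, to_id, strength]
# triple over the person ids assigned above and reusing the get_persons_set
# helper (the linear-scan merge is what the 'SLOW?' comment is about):
from itertools import combinations


def make_doc_edge_list(persons_dict, line):
    """Sketch: strength-1 edges between every pair of persons in one document."""
    ids = sorted(persons_dict[p][1] for p in get_persons_set(line))
    return [[a, b, 1] for a, b in combinations(ids, 2)]


def update_edge_list(doc_edge_list, big_edge_list):
    """Sketch: fold a document's edges into the running list, summing strengths."""
    for edge in doc_edge_list:
        for existing in big_edge_list:
            if existing[0] == edge[0] and existing[1] == edge[1]:
                existing[2] += edge[2]
                break
        else:  # no matching edge found
            big_edge_list.append(list(edge))
    return big_edge_list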
def main(): # create blank full df df = pd.DataFrame(columns=GKG_COLUMN_NAMES) file_list = glob.glob(ARMY_GKG_DAILY_DIR + "Army_GKG_by_day*.csv") for f in file_list: logging.info("reading" + f) # read the file into temp df tdf = pd.read_csv(f, header=0, sep='\t', names=GKG_COLUMN_NAMES, index_col=False) # append temp df to full df df = df.append(tdf, ignore_index=True) logging.info("consolidated df shape: " + str(df.shape)) # build a list of the column items_list = df['V2GCAM'].tolist() # Read CodeBook into dictionary with open(GCAM_CODEBOOK, errors="backslashreplace") as f: gcam_lines = f.read().splitlines() # Build gcam dictionary logging.info("building GCAM dictionary") gcam_dict = {} skip = True # skip line 1 for line in gcam_lines: if skip: skip = False else: line = line.strip() cols = line.split("\t") gcam_dict[cols[0]] = cols[6] items_dict = {} # This will hold the pairs => gcam_code : total_score logging.info("building histogram") for line in items_list: # line has the content from the GKG cell l = line.strip() entries = l.split(",") for entry in entries: entry = entry.strip() code = entry.split(":")[0] score = entry.split(":")[1] if code[0] == "v": if code in items_dict: items_dict[code] += float(score) else: items_dict[code] = float(score) # annotate the gcam codes with their human dimension names labeled_items_dict = {} logging.info("adding dimension labels") for key, value in items_dict.items(): label = gcam_dict[key] labeled_items_dict[key + " " + label] = value #print(str(themes_dict)) logging.info("building dataframe") items_hist_df = pd.DataFrame.from_dict(labeled_items_dict, orient='index') items_hist_df = items_hist_df.sort_values(by=0, ascending=False) #print(str(themes_hist_df.head(500))) outfile = ARMY_GKG_DAILY_DIR + "GCAM_Values_Histogram.csv" logging.info("writing " + outfile) items_hist_df.to_csv(outfile, header=False, sep="\t")
def main(): TITLE = "GKG Persons Graph (Top 19 Entities)" OUTFILE = GRAPH_DIR + "GKG-graph.html" search_subtitle = "Filtered on: Organization = US Army (5464 entries)" date_subtitle = "1-FEB-2020 through 16-FEB-2020" # load nodes and edges node_input_file = GRAPH_DIR + "PersonsNodesTopN.csv" edge_input_file = GRAPH_DIR + "PersonsEdgeListTopN.csv" # nodes with open(node_input_file) as file: lines = list(file) # get node scale max_node_size = 0 for line in lines[1:]: line = line.strip() items = line.split(',') node_val_float = float(items[2].strip()) if node_val_float > max_node_size: max_node_size = node_val_float print("max node size: " + str(max_node_size)) nodes = [] NODE_SCALE = 50 for line in lines[1:]: line = line.strip() items = line.split(',') node_id = items[0].strip() node_value = items[2].strip() node_value = round( (float(node_value) / max_node_size) * NODE_SCALE) node_label = items[1].strip() row = ("{id: " + node_id + ", value: " + str(node_value) + ", label: " + "'" + node_label + "'}") nodes.append(row) #print(str(nodes)) # edges with open(edge_input_file) as file: lines = list(file) # get edge scale max_edge_size = 0 for line in lines[1:]: line = line.strip() items = line.split(',') edge_val_float = float(items[2].strip()) if edge_val_float > max_edge_size: max_edge_size = edge_val_float print("max edge size: " + str(max_edge_size)) edges = [] EDGE_SCALE = 10 for line in lines[1:]: line = line.strip() items = line.split(',') node_from = items[0].strip() node_to = items[1].strip() edge_value = items[2].strip() edge_value = round( (float(edge_value) / max_edge_size) * EDGE_SCALE) row = ("{from: " + node_from + ", to: " + node_to + ", value: " + str(edge_value) + ", title: " + "'" + str(edge_value) + "'}") edges.append(row) #print(str(edges)) # HTML Start html = write_header(TITLE, nodes, edges) html.append('<body onload="draw()">') html.append(f'<h1> {TITLE}</h1>') html.append(f'<h2> {date_subtitle}</h2>') html.append(f'<h2> {search_subtitle}</h2>') html.append('<div id="mynetwork"></div>') # CLOSING HTML footer = write_footer() for line in footer: html.append(line) logging.info("writing " + OUTFILE) with open(OUTFILE, 'w') as f: for line in html: f.writelines(line + "\n")