def test_csv_str_with_ends_with_blank_row_false(self) -> None:
    """Extraction from a CSV string stops at content_end_row when no
    trailing blank row is required (ends_with_blank_row=False)."""
    processor = CsvProcessor(etk=etk,
                             heading_row=1,
                             content_start_row=2,
                             heading_columns=(1, 3),
                             content_end_row=4,
                             ends_with_blank_row=False,
                             remove_leading_empty_rows=True,
                             required_columns=['text'])
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(table_str=csv_str,
                                                     dataset='test_set')
    ]
    expected = [{
        'text': '1',
        'with': '2',
        'Polish': '3',
        'non-Latin': '4',
        'lettes': '5',
        'dataset': 'test_set'
    }, {
        'text': 'a',
        'with': 'b',
        'Polish': 'c',
        'non-Latin': 'd',
        'lettes': 'e',
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
def test_dataframe_input_2(self) -> None:
    """A pandas DataFrame fed through tabular_extractor keeps its numeric
    dtypes (ints/floats) in the resulting documents."""
    processor = CsvProcessor(etk=etk,
                             heading_row=1,
                             content_start_row=9,
                             content_end_row=10)
    source_csv = './etk/unit_tests/ground_truth/masie_4km_allyears_extent_sqkm.csv'
    frame = pd.read_csv(source_csv, skiprows=1)
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(dataframe=frame,
                                                     dataset='test_set')
    ]
    expected = [{
        'yyyyddd': 2006008,
        ' (0) Northern_Hemisphere': 13536736.84,
        ' (1) Beaufort_Sea': 1069710.81,
        ' (2) Chukchi_Sea': 966006.16,
        ' (3) East_Siberian_Sea': 1087102.72,
        ' (4) Laptev_Sea': 897773.37,
        ' (5) Kara_Sea': 927602.17,
        ' (6) Barents_Sea': 474574.82,
        ' (7) Greenland_Sea': 590029.18,
        ' (8) Baffin_Bay_Gulf_of_St._Lawrence': 1005790.38,
        ' (9) Canadian_Archipelago': 852715.31,
        ' (10) Hudson_Bay': 1260779.00,
        ' (11) Central_Arctic': 3240326.47,
        ' (12) Bering_Sea': 692832.54,
        ' (13) Baltic_Sea': 21327.46,
        ' (14) Sea_of_Okhotsk': 424563.54,
        ' (15) Yellow_Sea': 14830.45,
        ' (16) Cook_Inlet': 8202.95,
        'dataset': 'test_set'
    }, {
        'yyyyddd': 2006009,
        ' (0) Northern_Hemisphere': 13536887.64,
        ' (1) Beaufort_Sea': 1069710.81,
        ' (2) Chukchi_Sea': 966006.16,
        ' (3) East_Siberian_Sea': 1087102.72,
        ' (4) Laptev_Sea': 897773.37,
        ' (5) Kara_Sea': 927602.17,
        ' (6) Barents_Sea': 474574.82,
        ' (7) Greenland_Sea': 590029.18,
        ' (8) Baffin_Bay_Gulf_of_St._Lawrence': 1005790.38,
        ' (9) Canadian_Archipelago': 852715.31,
        ' (10) Hudson_Bay': 1260779.00,
        ' (11) Central_Arctic': 3240326.47,
        ' (12) Bering_Sea': 692832.54,
        ' (13) Baltic_Sea': 21478.25,
        ' (14) Sea_of_Okhotsk': 424563.54,
        ' (15) Yellow_Sea': 14830.45,
        ' (16) Cook_Inlet': 8202.95,
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
def test_dataframe_input_string(self) -> None:
    """With df_string=True every DataFrame cell is coerced to str before
    the documents are built (note '1260779.0', not 1260779.0)."""
    processor = CsvProcessor(etk=etk,
                             heading_row=1,
                             content_start_row=10,
                             content_end_row=11)
    source_csv = './etk/unit_tests/ground_truth/masie_4km_allyears_extent_sqkm.csv'
    frame = pd.read_csv(source_csv, skiprows=1)
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(dataframe=frame,
                                                     dataset='test_set',
                                                     df_string=True)
    ]
    expected = [{
        'yyyyddd': '2006009',
        ' (0) Northern_Hemisphere': '13536887.64',
        ' (1) Beaufort_Sea': '1069710.81',
        ' (2) Chukchi_Sea': '966006.16',
        ' (3) East_Siberian_Sea': '1087102.72',
        ' (4) Laptev_Sea': '897773.37',
        ' (5) Kara_Sea': '927602.17',
        ' (6) Barents_Sea': '474574.82',
        ' (7) Greenland_Sea': '590029.18',
        ' (8) Baffin_Bay_Gulf_of_St._Lawrence': '1005790.38',
        ' (9) Canadian_Archipelago': '852715.31',
        ' (10) Hudson_Bay': '1260779.0',
        ' (11) Central_Arctic': '3240326.47',
        ' (12) Bering_Sea': '692832.54',
        ' (13) Baltic_Sea': '21478.25',
        ' (14) Sea_of_Okhotsk': '424563.54',
        ' (15) Yellow_Sea': '14830.45',
        ' (16) Cook_Inlet': '8202.95',
        'dataset': 'test_set'
    }, {
        'yyyyddd': '2006010',
        ' (0) Northern_Hemisphere': '13505426.35',
        ' (1) Beaufort_Sea': '1069710.81',
        ' (2) Chukchi_Sea': '966006.16',
        ' (3) East_Siberian_Sea': '1087102.72',
        ' (4) Laptev_Sea': '897773.37',
        ' (5) Kara_Sea': '933999.29',
        ' (6) Barents_Sea': '448185.27',
        ' (7) Greenland_Sea': '588279.64',
        ' (8) Baffin_Bay_Gulf_of_St._Lawrence': '1016857.87',
        ' (9) Canadian_Archipelago': '852715.31',
        ' (10) Hudson_Bay': '1260779.0',
        ' (11) Central_Arctic': '3217380.82',
        ' (12) Bering_Sea': '705348.17',
        ' (13) Baltic_Sea': '21493.81',
        ' (14) Sea_of_Okhotsk': '414191.19',
        ' (15) Yellow_Sea': '14830.45',
        ' (16) Cook_Inlet': '8202.95',
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
def test_csv_file_with_no_header_not_ends_with_blank_row(self) -> None:
    """A headerless CSV file gets generated C<i> column names; leading
    empty rows are dropped and no trailing blank row is required."""
    processor = CsvProcessor(etk=etk,
                             content_start_row=1,
                             content_end_row=8,
                             ends_with_blank_row=False,
                             remove_leading_empty_rows=True)
    path = 'etk/unit_tests/ground_truth/sample_csv.csv'
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(filename=path,
                                                     dataset='test_set')
    ]
    expected = [{
        'C0': '',
        'C1': 'name1',
        'C2': 'name2',
        'C3': '',
        'C4': '',
        'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
        'dataset': 'test_set'
    }, {
        'C0': 'col11',
        'C1': 'col12',
        'C2': 'col13',
        'C3': '',
        'C4': 'col15',
        'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
        'dataset': 'test_set'
    }, {
        'C0': 'col21',
        'C1': 'col22',
        'C2': 'col23',
        'C3': 'col24',
        'C4': 'col25',
        'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
        'dataset': 'test_set'
    }, {
        'C0': 'col31',
        'C1': 'col32',
        'C2': 'col33',
        'C3': 'col34',
        'C4': 'col35',
        'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
def test_real_excel_without_sheetname(self) -> None:
    """Reading a real Excel workbook without sheet_name yields no
    documents for this configuration."""
    processor = CsvProcessor(etk=etk,
                             heading_row=1,
                             content_start_row=10,
                             content_end_row=12)
    workbook = 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx'
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(filename=workbook,
                                                     dataset='test_set')
    ]
    self.assertEqual(docs, [])
def test_csv_encoding(self) -> None:
    """An explicit non-default encoding (utf-16) is honoured when the
    CSV file is read from disk."""
    processor = CsvProcessor(etk=etk, heading_row=1)
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(
            filename='etk/unit_tests/ground_truth/test_encoding.csv',
            dataset='test_set',
            encoding='utf-16')
    ]
    expected = [{
        'Country': 'Algeria',
        'Category': 'Crude Oil Production',
        'DateTime': '2/28/2018 12:00:00 AM',
        'Close': '1036.0000',
        'Frequency': 'Monthly',
        'HistoricalDataSymbol': 'ALGERIACRUOILPRO',
        'LastUpdate': '3/14/2018 2:17:00 PM',
        'file_name': 'etk/unit_tests/ground_truth/test_encoding.csv',
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
def test_real_excel_with_sheetname(self) -> None:
    """Extract rows 10-12 from the named sheet of a real Excel workbook.

    With an explicit sheet_name, one document is produced per content row.
    Per the expected data, numeric cells keep their numeric type (e.g.
    'Total Deaths': 2) while empty cells come back as '' — TODO confirm
    this coercion is intended for all dtypes.
    """
    csv_processor = CsvProcessor(etk=etk,
                                 heading_row=1,
                                 content_start_row=10,
                                 content_end_row=12)
    file_path = 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx'
    sheets_name = 'NST Main Dataset'
    test_docs = [
        doc.cdr_document for doc in csv_processor.tabular_extractor(
            filename=file_path, sheet_name=sheets_name, dataset='test_set')
    ]
    # Three expected documents, one per spreadsheet row (rows 10, 11, 12).
    expected_docs = [{
        'Title': 'David Usman and Shot Dead',
        'Date': '2011-06-07',
        'Community (city,town, ward)': 'Maiduguri',
        'LGA': 'Maiduguri',
        'State': 'Borno',
        'Total Deaths': 2,
        'Boko Haram (P)': 'Boko Haram',
        'State Actor (P)': '',
        'Sectarian Actor (excluding BH) (P)': '',
        'Other Armed Actor (P)': '',
        'Kidnapper (P)': '',
        'Robber (P)': '',
        'Other (P)': '',
        'Election-related Actor (P)': '',
        'Cameroon State Actor (P)': '',
        'Boko Haram (V)': '',
        'State Actor (V)': '',
        'Sectarian Actor (V)': 2,
        'Other Armed Actor (V)': '',
        'Political Actor (V)': '',
        'Kidnapper (V)': '',
        'Kidnapee (V)': '',
        'Robber (V)': '',
        'Journalist (V)': '',
        'Civilian (V)': '',
        'Election-related Actor (V)': '',
        'Cameroon State Actor': '',
        'Bomb': '',
        'Gun': 'Gun',
        'Machete': '',
        'Suicide Bombing': '',
        'Other Weapon': '',
        'TK': 'Targeted Killing',
        'Drinking Establishment': '',
        'Goverment Building': '',
        'Church': '',
        'Mosque': '',
        'Bank': '',
        'School': '',
        'Other Location': 'Other',
        'Notes': '',
        'Sources 1': 'http://allafrica.com/stories/201106100373.html',
        'Sources 2': 'http://www.bbc.co.uk/news/world-africa-13724349',
        'Sources 3': '',
        'Latitude': '',
        'Longitude': '',
        'full place name': 'Maiduguri, Borno, Nigeria',
        'country': 'Nigeria',
        'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
        'dataset': 'test_set'
    }, {
        'Title': 'Explosion, Firefight at Gwange Police Station',
        'Date': '2011-06-07',
        'Community (city,town, ward)': 'Gwange Police Station, Maiduguri',
        'LGA': 'Maiduguri',
        'State': 'Borno',
        'Total Deaths': 3,
        'Boko Haram (P)': 'Boko Haram',
        'State Actor (P)': 'State Actor',
        'Sectarian Actor (excluding BH) (P)': '',
        'Other Armed Actor (P)': '',
        'Kidnapper (P)': '',
        'Robber (P)': '',
        'Other (P)': '',
        'Election-related Actor (P)': '',
        'Cameroon State Actor (P)': '',
        'Boko Haram (V)': 3,
        'State Actor (V)': '',
        'Sectarian Actor (V)': '',
        'Other Armed Actor (V)': '',
        'Political Actor (V)': '',
        'Kidnapper (V)': '',
        'Kidnapee (V)': '',
        'Robber (V)': '',
        'Journalist (V)': '',
        'Civilian (V)': '',
        'Election-related Actor (V)': '',
        'Cameroon State Actor': '',
        'Bomb': 'Bomb',
        'Gun': 'Gun',
        'Machete': '',
        'Suicide Bombing': '',
        'Other Weapon': '',
        'TK': '',
        'Drinking Establishment': '',
        'Goverment Building': 'Government Building',
        'Church': '',
        'Mosque': '',
        'Bank': '',
        'School': '',
        'Other Location': '',
        'Notes': '',
        'Sources 1': 'http://www.google.com/hostednews/afp/article/ALeqM5hofvKayKKAFFtiX9-Ic5bG2ptVmg?docId=CNG.fafcacea0287fbeab90256732f165e1e.771',
        'Sources 2': 'http://news.xinhuanet.com/english2010/world/2011-06/08/c_13915959.htm',
        'Sources 3': '',
        'Latitude': '',
        'Longitude': '',
        'full place name': 'Maiduguri, Borno, Nigeria',
        'country': 'Nigeria',
        'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
        'dataset': 'test_set'
    }, {
        'Title': 'Explosions at Dandal Police Station',
        'Date': '2011-06-07',
        'Community (city,town, ward)': 'Dandal Police Station, Maiduguri',
        'LGA': 'Maiduguri',
        'State': 'Borno',
        'Total Deaths': 0,
        'Boko Haram (P)': 'Boko Haram',
        'State Actor (P)': '',
        'Sectarian Actor (excluding BH) (P)': '',
        'Other Armed Actor (P)': '',
        'Kidnapper (P)': '',
        'Robber (P)': '',
        'Other (P)': '',
        'Election-related Actor (P)': '',
        'Cameroon State Actor (P)': '',
        'Boko Haram (V)': '',
        'State Actor (V)': 0,
        'Sectarian Actor (V)': '',
        'Other Armed Actor (V)': '',
        'Political Actor (V)': '',
        'Kidnapper (V)': '',
        'Kidnapee (V)': '',
        'Robber (V)': '',
        'Journalist (V)': '',
        'Civilian (V)': '',
        'Election-related Actor (V)': '',
        'Cameroon State Actor': '',
        'Bomb': 'Bomb',
        'Gun': '',
        'Machete': '',
        'Suicide Bombing': '',
        'Other Weapon': '',
        'TK': '',
        'Drinking Establishment': '',
        'Goverment Building': 'Government Building',
        'Church': '',
        'Mosque': '',
        'Bank': '',
        'School': '',
        'Other Location': '',
        'Notes': '',
        'Sources 1': 'http://news.xinhuanet.com/english2010/world/2011-06/08/c_13915959.htm',
        'Sources 2': 'http://www.google.com/hostednews/afp/article/ALeqM5hofvKayKKAFFtiX9-Ic5bG2ptVmg?docId=CNG.fafcacea0287fbeab90256732f165e1e.771',
        'Sources 3': '',
        'Latitude': '',
        'Longitude': '',
        'full place name': 'Maiduguri, Borno, Nigeria',
        'country': 'Nigeria',
        'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
        'dataset': 'test_set'
    }]
    self.assertEqual(test_docs, expected_docs)
def test_tab_file(self):
    """Extract TSV content passed in-memory via file_content.

    file_type='tsv' selects tab delimiting; the 'filename' argument only
    labels the documents (it is copied into each doc's 'file_name' field,
    not opened). Per the expected data, integer-looking cells become ints
    while decimal strings like '3.4' stay strings — TODO confirm that
    type-coercion rule against CsvProcessor.
    """
    csv_processor = CsvProcessor(etk=etk, heading_row=1)
    # Four ICEWS-style event rows, tab-delimited, newline-terminated.
    test_str = 'Event ID\tEvent Date\tSource Name\tSource Sectors\tSource Country\tEvent Text\tCAMEO Code\tIntensity\tTarget Name\tTarget Sectors\tTarget Country\tStory ID\tSentence Number\tPublisher\tCity\tDistrict\tProvince\tCountry\tLatitude\tLongitude\n926685\t1995-01-01\tExtremist (Russia)\tRadicals / Extremists / Fundamentalists,Dissident\tRussian Federation\tPraise or endorse\t051\t3.4\tBoris Yeltsin\tElite,Executive,Executive Office,Government\tRussian Federation\t28235806\t5\tThe Toronto Star\tMoscow\t\tMoskva\tRussian Federation\t55.7522\t37.6156\n926687\t1995-01-01\tGovernment (Bosnia and Herzegovina)\tGovernment\tBosnia and Herzegovina\tExpress intent to cooperate\t030\t4\tCitizen (Serbia)\tGeneral Population / Civilian / Social,Social\tSerbia\t28235807\t1\tThe Toronto Star\t\t\tBosnia\tBosnia and Herzegovina\t44\t18\n926686\t1995-01-01\tCitizen (Serbia)\tGeneral Population / Civilian / Social,Social\tSerbia\tExpress intent to cooperate\t030\t4\tGovernment (Bosnia and Herzegovina)\tGovernment\tBosnia and Herzegovina\t28235807\t1\tThe Toronto Star\t\t\tBosnia\tBosnia and Herzegovina\t44\t18\n926688\t1995-01-01\tCanada\t\tCanada\tPraise or endorse\t051\t3.4\tCity Mayor (Canada)\tGovernment,Local,Municipal\tCanada\t28235809\t3\tThe Toronto Star\t\t\tOntario\tCanada\t49.2501\t-84.4998\n'
    test_docs = [
        doc.cdr_document
        for doc in csv_processor.tabular_extractor(filename='testfile.tab',
                                                   file_type='tsv',
                                                   file_content=test_str,
                                                   dataset='test_set')
    ]
    # One expected document per data row of test_str, in input order.
    expected_docs = [{
        'Event ID': 926685,
        'Event Date': '1995-01-01',
        'Source Name': 'Extremist (Russia)',
        'Source Sectors': 'Radicals / Extremists / Fundamentalists,Dissident',
        'Source Country': 'Russian Federation',
        'Event Text': 'Praise or endorse',
        'CAMEO Code': '051',
        'Intensity': '3.4',
        'Target Name': 'Boris Yeltsin',
        'Target Sectors': 'Elite,Executive,Executive Office,Government',
        'Target Country': 'Russian Federation',
        'Story ID': 28235806,
        'Sentence Number': 5,
        'Publisher': 'The Toronto Star',
        'City': 'Moscow',
        'District': '',
        'Province': 'Moskva',
        'Country': 'Russian Federation',
        'Latitude': '55.7522',
        'Longitude': '37.6156',
        'file_name': 'testfile.tab',
        'dataset': 'test_set'
    }, {
        'Event ID': 926687,
        'Event Date': '1995-01-01',
        'Source Name': 'Government (Bosnia and Herzegovina)',
        'Source Sectors': 'Government',
        'Source Country': 'Bosnia and Herzegovina',
        'Event Text': 'Express intent to cooperate',
        'CAMEO Code': '030',
        'Intensity': 4,
        'Target Name': 'Citizen (Serbia)',
        'Target Sectors': 'General Population / Civilian / Social,Social',
        'Target Country': 'Serbia',
        'Story ID': 28235807,
        'Sentence Number': 1,
        'Publisher': 'The Toronto Star',
        'City': '',
        'District': '',
        'Province': 'Bosnia',
        'Country': 'Bosnia and Herzegovina',
        'Latitude': 44,
        'Longitude': 18,
        'file_name': 'testfile.tab',
        'dataset': 'test_set'
    }, {
        'Event ID': 926686,
        'Event Date': '1995-01-01',
        'Source Name': 'Citizen (Serbia)',
        'Source Sectors': 'General Population / Civilian / Social,Social',
        'Source Country': 'Serbia',
        'Event Text': 'Express intent to cooperate',
        'CAMEO Code': '030',
        'Intensity': 4,
        'Target Name': 'Government (Bosnia and Herzegovina)',
        'Target Sectors': 'Government',
        'Target Country': 'Bosnia and Herzegovina',
        'Story ID': 28235807,
        'Sentence Number': 1,
        'Publisher': 'The Toronto Star',
        'City': '',
        'District': '',
        'Province': 'Bosnia',
        'Country': 'Bosnia and Herzegovina',
        'Latitude': 44,
        'Longitude': 18,
        'file_name': 'testfile.tab',
        'dataset': 'test_set'
    }, {
        'Event ID': 926688,
        'Event Date': '1995-01-01',
        'Source Name': 'Canada',
        'Source Sectors': '',
        'Source Country': 'Canada',
        'Event Text': 'Praise or endorse',
        'CAMEO Code': '051',
        'Intensity': '3.4',
        'Target Name': 'City Mayor (Canada)',
        'Target Sectors': 'Government,Local,Municipal',
        'Target Country': 'Canada',
        'Story ID': 28235809,
        'Sentence Number': 3,
        'Publisher': 'The Toronto Star',
        'City': '',
        'District': '',
        'Province': 'Ontario',
        'Country': 'Canada',
        'Latitude': '49.2501',
        'Longitude': '-84.4998',
        'file_name': 'testfile.tab',
        'dataset': 'test_set'
    }]
    self.assertEqual(test_docs, expected_docs)
def test_real_csv_file_1(self) -> None:
    """Read the first three content rows of a real ACLED CSV export.

    content_end_row=4 with ends_with_blank_row=False bounds extraction to
    rows 2-4 after the heading row; each row becomes one document carrying
    the source path in 'file_name' and the dataset label.
    """
    csv_processor = CsvProcessor(etk=etk,
                                 heading_row=1,
                                 content_end_row=4,
                                 ends_with_blank_row=False)
    file_path = 'etk/unit_tests/ground_truth/acled_raw_data.csv'
    test_docs = [
        doc.cdr_document for doc in csv_processor.tabular_extractor(
            filename=file_path, dataset='test_set')
    ]
    # Long 'notes' values use adjacent-literal concatenation purely for
    # source readability — each is a single runtime string.
    expected_docs = [{
        'data_id': 336907,
        'iso': 180,
        'event_id_cnty': 'DRC11776',
        'event_id_no_cnty': 11776,
        'event_date': '2018-01-13',
        'year': 2018,
        'time_precision': 1,
        'event_type': 'Battle-No change of territory',
        'actor1': 'Military Forces of Democratic Republic of Congo (2001-)',
        'assoc_actor_1': '',
        'inter1': 1,
        'actor2': 'ADF: Allied Democratic Forces',
        'assoc_actor_2': '',
        'inter2': 2,
        'interaction': 12,
        'region': 'Central Africa',
        'country': 'Democratic Republic of Congo',
        'admin1': 'Nord-Kivu',
        'admin2': 'Nord-Kivu',
        'admin3': 'Oicha',
        'location': 'Oicha',
        'latitude': '0.7',
        'longitude': '29.5167',
        'geo_precision': 1,
        'source': 'Radio Okapi',
        'source_scale': 'Subnational',
        'notes': "Presumed FARDC attacked the ADF in the periphery of Oicha on January 13th. Shots were heard "
                 "in the locality and it is suspected that the FARDC are attacking the 'death triangle' situated in between "
                 "Mbau, Kamango and Eringeti. The reports are not confirmed by the military.",
        'fatalities': 0,
        'timestamp': 1516117305,
        'file_name': 'etk/unit_tests/ground_truth/acled_raw_data.csv',
        'dataset': 'test_set'
    }, {
        'data_id': 336908,
        'iso': 180,
        'event_id_cnty': 'DRC11777',
        'event_id_no_cnty': 11777,
        'event_date': '2018-01-13',
        'year': 2018,
        'time_precision': 1,
        'event_type': 'Battle-No change of territory',
        'actor1': 'Military Forces of Democratic Republic of Congo (2001-)',
        'assoc_actor_1': '',
        'inter1': 1,
        'actor2': 'ADF: Allied Democratic Forces',
        'assoc_actor_2': '',
        'inter2': 2,
        'interaction': 12,
        'region': 'Central Africa',
        'country': 'Democratic Republic of Congo',
        'admin1': 'Nord-Kivu',
        'admin2': 'Beni',
        'admin3': 'Beni',
        'location': 'Beni',
        'latitude': '0.49658',
        'longitude': '29.4654',
        'geo_precision': 1,
        'source': 'Reuters; Radio Okapi',
        'source_scale': 'Subnational',
        'notes': 'The FARDC launched, on January 13th, an offensive against the ADF in Beni and Lubero, '
                 'in response to the recents attacks by the group. Gunfires and explosions were heard '
                 'in Beni all throughout Saturday (13th).',
        'fatalities': 0,
        'timestamp': 1516117305,
        'file_name': 'etk/unit_tests/ground_truth/acled_raw_data.csv',
        'dataset': 'test_set'
    }, {
        'data_id': 336909,
        'iso': 180,
        'event_id_cnty': 'DRC11778',
        'event_id_no_cnty': 11778,
        'event_date': '2018-01-13',
        'year': 2018,
        'time_precision': 1,
        'event_type': 'Battle-No change '
                      'of territory',
        'actor1': 'Military Forces of Democratic Republic of Congo (2001-)',
        'assoc_actor_1': '',
        'inter1': 1,
        'actor2': 'ADF: Allied Democratic Forces',
        'assoc_actor_2': '',
        'inter2': 2,
        'interaction': 12,
        'region': 'Central Africa',
        'country': 'Democratic Republic of Congo',
        'admin1': 'Nord-Kivu',
        'admin2': 'Nord-Kivu',
        'admin3': 'Lubero',
        'location': 'Lubero',
        'latitude': '-0.15867',
        'longitude': '29.2386',
        'geo_precision': 1,
        'source': 'Reuters; Radio Okapi',
        'source_scale': 'Subnational',
        'notes': 'The FARDC launched, on January 13th, an offensive '
                 'against the ADF in Beni and Lubero, in response to the recents attacks by the group.',
        'fatalities': 0,
        'timestamp': 1516117305,
        'file_name': 'etk/unit_tests/ground_truth/acled_raw_data.csv',
        'dataset': 'test_set'
    }]
    self.assertEqual(test_docs, expected_docs)
def __init__(self, etk):
    """Set up the extractors this module uses on each ACLED row.

    Builds a date extractor, three glossary extractors (countries, US/Canada
    states, cities) loaded from gzipped JSON glossaries, a CSV processor for
    the raw input, and a decoder that maps ACLED numeric 'interaction' codes
    to human-readable actor-pair descriptions.
    """
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
    # Glossary extractors match up to 3-gram spans, case-insensitively.
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.json.gz", read_json=True),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.states_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
        "states_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.json.gz", read_json=True),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
    # ACLED 'interaction' codes: tens digit = first actor type, units digit
    # = second actor type; only the combinations listed here are decoded.
    self.interaction_decoding_dict = {
        "10": "Sole Military Action",
        "11": "Military Versus Military",
        "12": "Military Versus Rebels",
        "13": "Military Versus Political Militia",
        "14": "Military Versus Communal Militia",
        "15": "Military Versus Rioters",
        "16": "Military Versus Protesters",
        "17": "Military Versus Civilians",
        "18": "Military Versus Other",
        "20": "Sole Rebel Action",
        "22": "Rebels Versus Rebels",
        "23": "Rebels Versus Political Militia",
        "24": "Rebels Versus Communal Militia",
        "25": "Rebels Versus Rioters",
        "26": "Rebels Versus Protesters",
        "27": "Rebels Versus Civilians",
        "28": "Rebels Versus Other",
        "30": "Sole Political Militia Action",
        "33": "Political Militia Versus Political Militia",
        "34": "Political Militia Versus Communal Militia",
        "35": "Political Militia Versus Rioters",
        "36": "Political Militia Versus Protesters",
        "37": "Political Militia Versus Civilians",
        "38": "Political Militia Versus Other",
        "40": "Sole Communal Militia Action",
        "44": "Communal Militia Versus Communal Militia",
        "45": "Communal Militia Versus Rioters",
        "46": "Communal Militia Versus Protesters",
        "47": "Communal Militia Versus Civilians",
        "48": "Communal Militia Versus Other",
        "50": "Sole Rioter Action",
        "55": "Rioters Versus Rioters",
        "56": "Rioters Versus Protesters",
        "57": "Rioters Versus Civilians",
        "58": "Rioters Versus Other",
        "60": "Sole Protester Action",
        "66": "Protesters Versus Protesters",
        "68": "Protesters Versus Other",
        "78": "Other Actor Versus Civilians",
        "80": "Sole Other Action"
    }
    self.interaction_decoder = DecodingValueExtractor(
        self.interaction_decoding_dict,
        'default_decoding',
        case_sensitive=True)
def __init__(self, etk):
    """Minimal module: defer all setup to the ETKModule base class."""
    ETKModule.__init__(self, etk)

def process_document(self, doc):
    # Intentionally a no-op: this module only exists so ETK has a module
    # to load; the demo below drives extraction directly.
    pass

# Demo entry point: extract from an in-memory CSV string and pretty-print.
if __name__ == "__main__":
    # Ragged CSV fixture: data rows have more cells than the header, one
    # row has non-Latin characters, and one row starts with a blank cell.
    csv_str = """text,with,Polish,non-Latin,lettes
1,2,3,4,5,6
a,b,c,d,e,f
gęś,zółty,wąż,idzie,wąską,dróżką,
,b,c,s,w,f
"""
    etk = ETK(modules=CsvETKModule)
    cp = CsvProcessor(etk=etk,
                      heading_row=1,
                      heading_columns=(1, 3),
                      content_end_row=3,
                      ends_with_blank_row=True,
                      remove_leading_empty_rows=True,
                      required_columns=['text'])
    data_set = 'test_data_set_csv'  # NOTE(review): assigned but unused below
    docs = [
        doc.cdr_document
        for doc in cp.tabular_extractor(table_str=csv_str,
                                        dataset='test_csv_str_with_all_args')
    ]
    pprint.pprint(docs)
# for segment in doc.select_segments(jsonpath='$.notes'): # doc.kg.add_value("description", segment.value) doc.kg.add_value("description", json_path='$.notes') def document_selector(self, doc) -> bool: """ Boolean function for selecting document Args: doc: Document Returns: """ return DefaultDocumentSelector().select_document(doc) if __name__ == "__main__": kg_schema = KGSchema(json.load(open('master_config.json'))) etk = ETK(modules=AcledModule, kg_schema=kg_schema) cp = CsvProcessor(etk=etk, heading_row=1) data_set = 'test_data_set_csv' docs = cp.tabular_extractor(filename="acled_raw_data.csv", dataset='acled', doc_id_field="data_id") results = etk.process_ems(docs[0]) print(json.dumps(results[0].value, indent=2))
    value=doc.extract(
        self.country_decoder,
        doc.select_segments("$.ActorCountryCode")[0]))
# Note: not mapping the Actor Geo codes, because Pedro doesn't understand what they mean.
return list()

# Demo entry point: run the GDELT modules over a sample TSV and write the
# framed documents to gdelt.jl (one JSON object per line).
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG, the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('../events_ucdp/master_config.json')))
    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[GdeltModule, GdeltActorModule], kg_schema=kg_schema)
    # Create a CSV processor to create documents for the relevant rows in the TSV file;
    # columns are named COL1..COLn from the module's declared header width.
    cp = CsvProcessor(etk=etk,
                      heading_columns=(1, len(GdeltModule.header_fields)),
                      column_name_prefix="COL")
    with open("gdelt.jl", "w") as f:
        # Iterate over all the rows in the spredsheet
        for d in cp.tabular_extractor(filename="20170912.export_sample.tsv",
                                      dataset='gdelt'):
            for result in etk.process_ems(d):
                # print(d.cdr_document)
                # print(json.dumps(result.cdr_document.get("knowledge_graph"), indent=2))
                print(result.cdr_document.get("knowledge_graph"))
                f.write(json.dumps(result.cdr_document) + "\n")
def test_dataframe_input(self) -> None:
    """DataFrame input keeps pandas dtypes: floats stay floats and blank
    cells surface as NaN rather than empty strings."""
    processor = CsvProcessor(etk=etk,
                             heading_row=1,
                             content_end_row=2,
                             ends_with_blank_row=False)
    frame = pd.read_csv('etk/unit_tests/ground_truth/acled_raw_data.csv')
    docs = [
        extracted.cdr_document
        for extracted in processor.tabular_extractor(dataframe=frame,
                                                     dataset='test_set')
    ]
    expected = [{
        'data_id': 336907,
        'iso': 180,
        'event_id_cnty': 'DRC11776',
        'event_id_no_cnty': 11776,
        'event_date': '2018-01-13',
        'year': 2018,
        'time_precision': 1,
        'event_type': 'Battle-No change of territory',
        'actor1': 'Military Forces of Democratic Republic of Congo (2001-)',
        'assoc_actor_1': np.nan,
        'inter1': 1,
        'actor2': 'ADF: Allied Democratic Forces',
        'assoc_actor_2': np.nan,
        'inter2': 2,
        'interaction': 12,
        'region': 'Central Africa',
        'country': 'Democratic Republic of Congo',
        'admin1': 'Nord-Kivu',
        'admin2': 'Nord-Kivu',
        'admin3': 'Oicha',
        'location': 'Oicha',
        'latitude': 0.7,
        'longitude': 29.5167,
        'geo_precision': 1,
        'source': 'Radio Okapi',
        'source_scale': 'Subnational',
        'notes': "Presumed FARDC attacked the ADF in the periphery of Oicha on January 13th. Shots were heard "
                 "in the locality and it is suspected that the FARDC are attacking the 'death triangle' situated in between "
                 "Mbau, Kamango and Eringeti. The reports are not confirmed by the military.",
        'fatalities': 0,
        'timestamp': 1516117305,
        'dataset': 'test_set'
    }]
    self.assertEqual(docs, expected)
return doc.cdr_document.get(
    "dataset") == "lake_chad_basin_displaced_victim"

def process_document(self, doc: Document) -> List[Document]:
    """Map the victim row's 'total' and 'type' fields into the KG.

    Returns an empty list: no child documents are created.
    """
    doc.kg.add_value("size", json_path="total")
    doc.kg.add_value("type", json_path="type")
    return list()

# Script entry point: dir and master-config paths come from argv; output is
# written next to the input as '<file>.jl' (one framed doc per line).
if __name__ == "__main__":
    dir_path = sys.argv[1]
    master_config_path = sys.argv[2]
    file_name = 'lake_chad_basin_displaced.csv'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')
    kg_schema = KGSchema(json.load(open(master_config_path)))
    etk = ETK(modules=[
        LakeChadBasinDisplacedModule, LakeChadBasinDisplacedVictimModule,
        LCBPlaceModule
    ],
              kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1, content_start_row=3)
    with open(output_path, "w") as f:
        print(input_path, output_path)
        for doc in cp.tabular_extractor(filename=input_path,
                                        dataset='lake_chad_basin_displaced'):
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
class AcledModule(ETKModule):
    """ETK module that turns rows of an ACLED CSV export into KG documents."""

    def __init__(self, etk):
        """Build the extractors used per row: a date parser, glossary
        extractors for countries / US-Canada states / cities, a CSV
        processor, and a decoder for ACLED 'interaction' codes."""
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
        # Glossary extractors match up to 3-gram spans, case-insensitively.
        self.country_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/countries.json.gz", read_json=True),
            "country_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.states_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
            "states_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.cities_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/cities.json.gz", read_json=True),
            "cities_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
        # ACLED interaction code -> human-readable actor-pair description.
        self.interaction_decoding_dict = {
            "10": "Sole Military Action",
            "11": "Military Versus Military",
            "12": "Military Versus Rebels",
            "13": "Military Versus Political Militia",
            "14": "Military Versus Communal Militia",
            "15": "Military Versus Rioters",
            "16": "Military Versus Protesters",
            "17": "Military Versus Civilians",
            "18": "Military Versus Other",
            "20": "Sole Rebel Action",
            "22": "Rebels Versus Rebels",
            "23": "Rebels Versus Political Militia",
            "24": "Rebels Versus Communal Militia",
            "25": "Rebels Versus Rioters",
            "26": "Rebels Versus Protesters",
            "27": "Rebels Versus Civilians",
            "28": "Rebels Versus Other",
            "30": "Sole Political Militia Action",
            "33": "Political Militia Versus Political Militia",
            "34": "Political Militia Versus Communal Militia",
            "35": "Political Militia Versus Rioters",
            "36": "Political Militia Versus Protesters",
            "37": "Political Militia Versus Civilians",
            "38": "Political Militia Versus Other",
            "40": "Sole Communal Militia Action",
            "44": "Communal Militia Versus Communal Militia",
            "45": "Communal Militia Versus Rioters",
            "46": "Communal Militia Versus Protesters",
            "47": "Communal Militia Versus Civilians",
            "48": "Communal Militia Versus Other",
            "50": "Sole Rioter Action",
            "55": "Rioters Versus Rioters",
            "56": "Rioters Versus Protesters",
            "57": "Rioters Versus Civilians",
            "58": "Rioters Versus Other",
            "60": "Sole Protester Action",
            "66": "Protesters Versus Protesters",
            "68": "Protesters Versus Other",
            "78": "Other Actor Versus Civilians",
            "80": "Sole Other Action"
        }
        self.interaction_decoder = DecodingValueExtractor(
            self.interaction_decoding_dict,
            'default_decoding',
            case_sensitive=True)

    def process_document(self, cdr_doc: Document):
        """Expand one CDR document that points at an ACLED CSV file.

        Reads the CSV at 'raw_content_path', creates one document per row
        (keyed by 'data_id'), and populates KG fields: event_date, website,
        description, title, country, state, city_name, and type.

        Returns:
            list of newly created row documents (empty when the input has
            no usable 'raw_content_path').
        """
        new_docs = list()
        cdr_doc_json = cdr_doc.cdr_document
        if 'raw_content_path' in cdr_doc_json and cdr_doc_json[
                'raw_content_path'].strip() != '':
            try:
                docs = self.csv_processor.tabular_extractor(
                    filename=cdr_doc_json['raw_content_path'],
                    dataset='acleddata',
                    doc_id_field="data_id")
                for doc in docs:
                    doc_json = doc.cdr_document  # NOTE(review): unused
                    # Parse and store every event_date segment.
                    event_date = doc.select_segments(jsonpath='$.event_date')
                    for segment in event_date:
                        extractions = doc.extract(
                            extractor=self.date_extractor,
                            extractable=segment)
                        for extraction in extractions:
                            doc.kg.add_value("event_date",
                                             value=extraction.value)
                    doc.kg.add_value("website", value='acleddata.com')
                    doc.kg.add_value("description", json_path='$.notes')
                    # Synthesized human-readable title from row fields.
                    acled_title = "{event_date}: {event_type} in {location}".format(
                        event_date=doc.cdr_document.get("event_date", ''),
                        event_type=doc.cdr_document.get("event_type", ''),
                        location=doc.cdr_document.get("location", ''))
                    doc.kg.add_value("title", value=acled_title)
                    doc.kg.add_value('country', json_path="$.country")
                    states_segments = doc.select_segments("$.state")
                    for state_segment in states_segments:
                        extracted_states = doc.extract(self.states_extractor,
                                                       state_segment)
                        doc.kg.add_value("state", value=extracted_states)
                    # Decode numeric interaction codes into 'type' values.
                    interaction_segments = doc.select_segments("$.interaction")
                    for interaction_segment in interaction_segments:
                        extracted_interaction = doc.extract(
                            self.interaction_decoder, interaction_segment)
                        doc.kg.add_value("type", value=extracted_interaction)
                    doc.kg.add_value("type", json_path="$.event_type")
                    doc.kg.add_value("type", json_path="$.type")
                    doc.kg.add_value("type", value="Event")
                    # Mine the location string for countries/cities/states.
                    location_segments = doc.select_segments("$.location")
                    for location_segment in location_segments:
                        ec_location = doc.extract(self.country_extractor,
                                                  location_segment)
                        doc.kg.add_value("country", ec_location)
                        ecity_location = doc.extract(self.cities_extractor,
                                                     location_segment)
                        doc.kg.add_value("city_name", ecity_location)
                        es_location = doc.extract(self.states_extractor,
                                                  location_segment)
                        doc.kg.add_value("state", es_location)
                    new_docs.append(doc)
            except Exception as e:
                # Wraps any per-file failure with module context before
                # propagating.
                raise Exception('Error in AcledModule', e)
        return new_docs

    def document_selector(self, doc) -> bool:
        """Return True only for documents tagged dataset == 'acleddata'."""
        return doc.cdr_document.get("dataset") == "acleddata"
# Script entry point: run the GTD module pipeline over the Nigeria extract
# and write one framed document per line to gtd.jl.
if __name__ == "__main__":
    # The DIG master_config doubles as the KG field schema.
    schema = KGSchema(json.load(open('master_config.json')))
    # Wire up ETK with every GTD sub-module and the schema.
    etk = ETK(modules=[
        GTDModule, GTDDamageModule, GTDInjuriesModule, GTDFatalitiesModule,
        GTDWeaponsModule, GTDActorModule, GTDVictimModule, GTDPlaceModule
    ],
              kg_schema=schema)
    # One document per relevant CSV row.
    processor = CsvProcessor(etk=etk, heading_row=1)
    row_docs = processor.tabular_extractor(
        filename="globalterrorismdb_0617dist-nigeria.csv", dataset='gtd')
    with open("gtd.jl", "w") as out:
        for row_doc in row_docs:
            etk.process_and_frame(row_doc)
            out.write(json.dumps(row_doc.cdr_document) + "\n")
ETKModule.__init__(self, etk)

def process_document(self, doc):
    # Intentionally a no-op; extraction is driven by the script below.
    pass

# Script entry point: extract rows 10-1723 of queryResults.csv (header on
# row 2) and dump all documents as a single JSON array to queryResults.jl.
if __name__ == "__main__":
    # NOTE(review): csv_str is defined but never used below.
    csv_str = """text,with,Polish,non-Latin,lettes
1,2,3,4,5,6
a,b,c,d,e,f
"""
    file_path = 'etk/unit_tests/ground_truth/queryResults.csv'
    jl_file_path = 'etk/unit_tests/ground_truth/queryResults.jl'  # NOTE(review): unused; news_path below repeats it
    etk = ETK(modules=CsvETKModule)
    csv_processor = CsvProcessor(etk=etk,
                                 heading_row=2,
                                 content_start_row=10,
                                 content_end_row=1723)
    data_set = 'test_data_set_csv'  # NOTE(review): assigned but unused
    test_docs = [
        doc.cdr_document
        for doc in csv_processor.tabular_extractor(filename=file_path,
                                                   dataset='test_set')
    ]
    """docs = [doc.cdr_document for doc in cp.tabular_extractor(table_str=filename, data_set='test_csv_str_with_all_args')]"""
    news_path = 'etk/unit_tests/ground_truth/queryResults.jl'
    # NOTE(review): file handle is never explicitly closed.
    news_data = open(news_path, 'w')
    news_data.write(json.dumps(test_docs))
    #pprint.pprint(test_docs)
# Add a title to the actor document doc.kg.add_value("title", json_path="$.Side") # Return an empty list because we didn't create new documents return [] # The main is for testing, and is not used in the DIG pipeline if __name__ == "__main__": # Tell ETK the schema of the fields in the KG, the DIG master_config can be used as the schema. kg_schema = KGSchema(json.load(open('master_config.json'))) # Instantiate ETK, with the two processing modules and the schema. etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema) # Create a CSV processor to create documents for the relevant rows in the Excel sheet cp = CsvProcessor(etk=etk, heading_row=1) with open("ucdp.jl", "w") as f: # Iterate over all the rows in the spredsheet for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'): # Each row produces a document, which we sent to ETK. # Note that each invocation of process_ems will also process any new documents created while # processing each doc etk.process_and_frame(doc) f.write(json.dumps(doc.cdr_document) + "\n") # for result in etk.process_ems(doc): # # print(result.cdr_document["knowledge_graph"]) # f.write(json.dumps(result.cdr_document) + "\n")
def convert_plus_to_list(value):
    """Split a '+'-delimited string into stripped tokens, dropping any
    token that contains 'has_topic'.

    Args:
        value: raw string such as 'actor + has_topic war + place'.

    Returns:
        list of cleaned tokens in their original order.
    """
    # Fix: the previous implementation called vals.remove(val) while
    # iterating over vals, which skips the element that follows each
    # removal — two consecutive 'has_topic' tokens left the second one in
    # the result. Building a new filtered list avoids the mutation bug.
    stripped = [val.strip() for val in value.split('+')]
    return [val for val in stripped if 'has_topic' not in val]

# Script entry point: read a GDELT-mapping CSV (argv[0]=input, argv[1]=output)
# and write the parsed mapping as pretty-printed JSON.
if __name__ == "__main__":
    parser = OptionParser()
    (c_options, args) = parser.parse_args()
    input_path = args[0]
    output_path = args[1]
    etk = ETK(modules=CsvETKModule)
    csv_processor = CsvProcessor(etk=etk, heading_row=1)
    data_set = 'elicit_gdelt_mapping'  # NOTE(review): assigned but unused
    parsed_docs = [
        doc.cdr_document
        for doc in csv_processor.tabular_extractor(filename=input_path,
                                                   dataset='elicit_mapping')
    ]
    # Fix: use a context manager so the output handle is flushed and closed
    # deterministically (the old bare open(...).write(...) leaked it).
    with open(output_path, 'w') as out_file:
        out_file.write(json.dumps(parse_gdelt_mapping(parsed_docs), indent=2))