示例#1
0
    def test_csv_str_with_ends_with_blank_row_false(self) -> None:
        """A CSV string without a trailing blank row still yields both rows."""
        processor = CsvProcessor(etk=etk,
                                 heading_row=1,
                                 content_start_row=2,
                                 heading_columns=(1, 3),
                                 content_end_row=4,
                                 ends_with_blank_row=False,
                                 remove_leading_empty_rows=True,
                                 required_columns=['text'])

        actual_docs = []
        for document in processor.tabular_extractor(table_str=csv_str,
                                                    dataset='test_set'):
            actual_docs.append(document.cdr_document)

        # Both content rows keyed by the heading row, plus the dataset label.
        columns = ['text', 'with', 'Polish', 'non-Latin', 'lettes']
        expected_docs = [
            dict(zip(columns, ['1', '2', '3', '4', '5']), dataset='test_set'),
            dict(zip(columns, ['a', 'b', 'c', 'd', 'e']), dataset='test_set'),
        ]

        self.assertEqual(actual_docs, expected_docs)
示例#2
0
    def test_dataframe_input_2(self) -> None:
        """Extract rows 9-10 of a pandas DataFrame into two CDR documents.

        The MASIE sea-ice CSV is pre-loaded with ``pd.read_csv`` (skipping
        the file's first row) and handed over via ``dataframe=``; numeric
        cells keep native int/float types in the resulting documents.
        """
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_start_row=9,
                                     content_end_row=10)

        file_path = './etk/unit_tests/ground_truth/masie_4km_allyears_extent_sqkm.csv'
        data = pd.read_csv(file_path, skiprows=1)
        test_docs = [
            doc.cdr_document
            for doc in csv_processor.tabular_extractor(dataframe=data,
                                                       dataset='test_set')
        ]

        # Two consecutive daily records (yyyyddd 2006008 and 2006009); the
        # 'dataset' key is injected by the extractor.
        expected_docs = [{
            'yyyyddd': 2006008,
            ' (0) Northern_Hemisphere': 13536736.84,
            ' (1) Beaufort_Sea': 1069710.81,
            ' (2) Chukchi_Sea': 966006.16,
            ' (3) East_Siberian_Sea': 1087102.72,
            ' (4) Laptev_Sea': 897773.37,
            ' (5) Kara_Sea': 927602.17,
            ' (6) Barents_Sea': 474574.82,
            ' (7) Greenland_Sea': 590029.18,
            ' (8) Baffin_Bay_Gulf_of_St._Lawrence': 1005790.38,
            ' (9) Canadian_Archipelago': 852715.31,
            ' (10) Hudson_Bay': 1260779.00,
            ' (11) Central_Arctic': 3240326.47,
            ' (12) Bering_Sea': 692832.54,
            ' (13) Baltic_Sea': 21327.46,
            ' (14) Sea_of_Okhotsk': 424563.54,
            ' (15) Yellow_Sea': 14830.45,
            ' (16) Cook_Inlet': 8202.95,
            'dataset': 'test_set'
        }, {
            'yyyyddd': 2006009,
            ' (0) Northern_Hemisphere': 13536887.64,
            ' (1) Beaufort_Sea': 1069710.81,
            ' (2) Chukchi_Sea': 966006.16,
            ' (3) East_Siberian_Sea': 1087102.72,
            ' (4) Laptev_Sea': 897773.37,
            ' (5) Kara_Sea': 927602.17,
            ' (6) Barents_Sea': 474574.82,
            ' (7) Greenland_Sea': 590029.18,
            ' (8) Baffin_Bay_Gulf_of_St._Lawrence': 1005790.38,
            ' (9) Canadian_Archipelago': 852715.31,
            ' (10) Hudson_Bay': 1260779.00,
            ' (11) Central_Arctic': 3240326.47,
            ' (12) Bering_Sea': 692832.54,
            ' (13) Baltic_Sea': 21478.25,
            ' (14) Sea_of_Okhotsk': 424563.54,
            ' (15) Yellow_Sea': 14830.45,
            ' (16) Cook_Inlet': 8202.95,
            'dataset': 'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#3
0
    def test_dataframe_input_string(self) -> None:
        """Extract DataFrame rows with ``df_string=True``: values as strings.

        Rows 10-11 of the MASIE sheet; every cell value in the expected
        documents is a string (note '1260779.0', the string form of the
        float, rather than the numeric 1260779.00).
        """
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_start_row=10,
                                     content_end_row=11)

        file_path = './etk/unit_tests/ground_truth/masie_4km_allyears_extent_sqkm.csv'
        data = pd.read_csv(file_path, skiprows=1)
        test_docs = [
            doc.cdr_document for doc in csv_processor.tabular_extractor(
                dataframe=data, dataset='test_set', df_string=True)
        ]

        expected_docs = [{
            'yyyyddd': '2006009',
            ' (0) Northern_Hemisphere': '13536887.64',
            ' (1) Beaufort_Sea': '1069710.81',
            ' (2) Chukchi_Sea': '966006.16',
            ' (3) East_Siberian_Sea': '1087102.72',
            ' (4) Laptev_Sea': '897773.37',
            ' (5) Kara_Sea': '927602.17',
            ' (6) Barents_Sea': '474574.82',
            ' (7) Greenland_Sea': '590029.18',
            ' (8) Baffin_Bay_Gulf_of_St._Lawrence': '1005790.38',
            ' (9) Canadian_Archipelago': '852715.31',
            ' (10) Hudson_Bay': '1260779.0',
            ' (11) Central_Arctic': '3240326.47',
            ' (12) Bering_Sea': '692832.54',
            ' (13) Baltic_Sea': '21478.25',
            ' (14) Sea_of_Okhotsk': '424563.54',
            ' (15) Yellow_Sea': '14830.45',
            ' (16) Cook_Inlet': '8202.95',
            'dataset': 'test_set'
        }, {
            'yyyyddd': '2006010',
            ' (0) Northern_Hemisphere': '13505426.35',
            ' (1) Beaufort_Sea': '1069710.81',
            ' (2) Chukchi_Sea': '966006.16',
            ' (3) East_Siberian_Sea': '1087102.72',
            ' (4) Laptev_Sea': '897773.37',
            ' (5) Kara_Sea': '933999.29',
            ' (6) Barents_Sea': '448185.27',
            ' (7) Greenland_Sea': '588279.64',
            ' (8) Baffin_Bay_Gulf_of_St._Lawrence': '1016857.87',
            ' (9) Canadian_Archipelago': '852715.31',
            ' (10) Hudson_Bay': '1260779.0',
            ' (11) Central_Arctic': '3217380.82',
            ' (12) Bering_Sea': '705348.17',
            ' (13) Baltic_Sea': '21493.81',
            ' (14) Sea_of_Okhotsk': '414191.19',
            ' (15) Yellow_Sea': '14830.45',
            ' (16) Cook_Inlet': '8202.95',
            'dataset': 'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#4
0
    def test_csv_file_with_no_header_not_ends_with_blank_row(self) -> None:
        """Headerless CSV file: columns are auto-named C0..C4.

        No ``heading_row`` is given, so the processor generates positional
        column names; empty cells come through as empty strings and the
        source path is recorded in each document's 'file_name'.
        """
        csv_processor = CsvProcessor(etk=etk,
                                     content_start_row=1,
                                     content_end_row=8,
                                     ends_with_blank_row=False,
                                     remove_leading_empty_rows=True)
        filename = 'etk/unit_tests/ground_truth/sample_csv.csv'

        test_docs = [
            doc.cdr_document
            for doc in csv_processor.tabular_extractor(filename=filename,
                                                       dataset='test_set')
        ]

        expected_docs = [{
            'C0': '',
            'C1': 'name1',
            'C2': 'name2',
            'C3': '',
            'C4': '',
            'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
            'dataset': 'test_set'
        }, {
            'C0': 'col11',
            'C1': 'col12',
            'C2': 'col13',
            'C3': '',
            'C4': 'col15',
            'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
            'dataset': 'test_set'
        }, {
            'C0': 'col21',
            'C1': 'col22',
            'C2': 'col23',
            'C3': 'col24',
            'C4': 'col25',
            'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
            'dataset': 'test_set'
        }, {
            'C0': 'col31',
            'C1': 'col32',
            'C2': 'col33',
            'C3': 'col34',
            'C4': 'col35',
            'file_name': 'etk/unit_tests/ground_truth/sample_csv.csv',
            'dataset': 'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#5
0
    def test_real_excel_without_sheetname(self) -> None:
        """Reading an .xlsx without specifying a sheet name yields no docs."""
        processor = CsvProcessor(etk=etk,
                                 heading_row=1,
                                 content_start_row=10,
                                 content_end_row=12)

        workbook_path = 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx'

        actual_docs = [
            document.cdr_document
            for document in processor.tabular_extractor(filename=workbook_path,
                                                        dataset='test_set')
        ]

        # Without a sheet_name argument nothing is extracted.
        self.assertEqual(actual_docs, [])
示例#6
0
    def test_csv_encoding(self) -> None:
        """A UTF-16 encoded CSV is decoded via the ``encoding`` argument."""
        processor = CsvProcessor(etk=etk, heading_row=1)
        source_path = 'etk/unit_tests/ground_truth/test_encoding.csv'

        actual_docs = []
        for document in processor.tabular_extractor(filename=source_path,
                                                    dataset='test_set',
                                                    encoding='utf-16'):
            actual_docs.append(document.cdr_document)

        expected_docs = [{
            'Country': 'Algeria',
            'Category': 'Crude Oil Production',
            'DateTime': '2/28/2018 12:00:00 AM',
            'Close': '1036.0000',
            'Frequency': 'Monthly',
            'HistoricalDataSymbol': 'ALGERIACRUOILPRO',
            'LastUpdate': '3/14/2018 2:17:00 PM',
            'file_name': 'etk/unit_tests/ground_truth/test_encoding.csv',
            'dataset': 'test_set'
        }]

        self.assertEqual(actual_docs, expected_docs)
示例#7
0
    def test_real_excel_with_sheetname(self) -> None:
        """Extract rows 10-12 of the 'NST Main Dataset' sheet of an .xlsx.

        Each spreadsheet row becomes one CDR document keyed by the heading
        row; empty cells appear as '' and numeric cells keep numeric types
        (e.g. 'Total Deaths': 2).
        """
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_start_row=10,
                                     content_end_row=12)

        file_path = 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx'

        sheets_name = 'NST Main Dataset'

        test_docs = [
            doc.cdr_document for doc in csv_processor.tabular_extractor(
                filename=file_path, sheet_name=sheets_name, dataset='test_set')
        ]

        # Three incident rows from the ground-truth workbook; 'file_name'
        # and 'dataset' are injected by the extractor.
        expected_docs = [{
            'Title': 'David Usman and  Shot Dead',
            'Date': '2011-06-07',
            'Community (city,town, ward)': 'Maiduguri',
            'LGA': 'Maiduguri',
            'State': 'Borno',
            'Total Deaths': 2,
            'Boko Haram (P)': 'Boko Haram',
            'State Actor (P)': '',
            'Sectarian Actor (excluding BH) (P)': '',
            'Other Armed Actor (P)': '',
            'Kidnapper (P)': '',
            'Robber (P)': '',
            'Other (P)': '',
            'Election-related Actor (P)': '',
            'Cameroon State Actor (P)': '',
            'Boko Haram (V)': '',
            'State Actor (V)': '',
            'Sectarian Actor (V)': 2,
            'Other Armed Actor (V)': '',
            'Political Actor (V)': '',
            'Kidnapper (V)': '',
            'Kidnapee (V)': '',
            'Robber (V)': '',
            'Journalist (V)': '',
            'Civilian (V)': '',
            'Election-related Actor (V)': '',
            'Cameroon State Actor': '',
            'Bomb': '',
            'Gun': 'Gun',
            'Machete': '',
            'Suicide Bombing': '',
            'Other Weapon': '',
            'TK': 'Targeted Killing',
            'Drinking Establishment': '',
            'Goverment Building': '',
            'Church': '',
            'Mosque': '',
            'Bank': '',
            'School': '',
            'Other Location': 'Other',
            'Notes': '',
            'Sources 1': 'http://allafrica.com/stories/201106100373.html',
            'Sources 2': 'http://www.bbc.co.uk/news/world-africa-13724349',
            'Sources 3': '',
            'Latitude': '',
            'Longitude': '',
            'full place name': 'Maiduguri, Borno, Nigeria',
            'country': 'Nigeria',
            'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
            'dataset': 'test_set'
        }, {
            'Title': 'Explosion, Firefight at Gwange Police Station',
            'Date': '2011-06-07',
            'Community (city,town, ward)': 'Gwange Police Station, Maiduguri',
            'LGA': 'Maiduguri',
            'State': 'Borno',
            'Total Deaths': 3,
            'Boko Haram (P)': 'Boko Haram',
            'State Actor (P)': 'State Actor',
            'Sectarian Actor (excluding BH) (P)': '',
            'Other Armed Actor (P)': '',
            'Kidnapper (P)': '',
            'Robber (P)': '',
            'Other (P)': '',
            'Election-related Actor (P)': '',
            'Cameroon State Actor (P)': '',
            'Boko Haram (V)': 3,
            'State Actor (V)': '',
            'Sectarian Actor (V)': '',
            'Other Armed Actor (V)': '',
            'Political Actor (V)': '',
            'Kidnapper (V)': '',
            'Kidnapee (V)': '',
            'Robber (V)': '',
            'Journalist (V)': '',
            'Civilian (V)': '',
            'Election-related Actor (V)': '',
            'Cameroon State Actor': '',
            'Bomb': 'Bomb',
            'Gun': 'Gun',
            'Machete': '',
            'Suicide Bombing': '',
            'Other Weapon': '',
            'TK': '',
            'Drinking Establishment': '',
            'Goverment Building': 'Government Building',
            'Church': '',
            'Mosque': '',
            'Bank': '',
            'School': '',
            'Other Location': '',
            'Notes': '',
            'Sources 1':
            'http://www.google.com/hostednews/afp/article/ALeqM5hofvKayKKAFFtiX9-Ic5bG2ptVmg?docId=CNG.fafcacea0287fbeab90256732f165e1e.771',
            'Sources 2':
            'http://news.xinhuanet.com/english2010/world/2011-06/08/c_13915959.htm',
            'Sources 3': '',
            'Latitude': '',
            'Longitude': '',
            'full place name': 'Maiduguri, Borno, Nigeria',
            'country': 'Nigeria',
            'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
            'dataset': 'test_set'
        }, {
            'Title': 'Explosions at Dandal Police Station',
            'Date': '2011-06-07',
            'Community (city,town, ward)': 'Dandal Police Station, Maiduguri',
            'LGA': 'Maiduguri',
            'State': 'Borno',
            'Total Deaths': 0,
            'Boko Haram (P)': 'Boko Haram',
            'State Actor (P)': '',
            'Sectarian Actor (excluding BH) (P)': '',
            'Other Armed Actor (P)': '',
            'Kidnapper (P)': '',
            'Robber (P)': '',
            'Other (P)': '',
            'Election-related Actor (P)': '',
            'Cameroon State Actor (P)': '',
            'Boko Haram (V)': '',
            'State Actor (V)': 0,
            'Sectarian Actor (V)': '',
            'Other Armed Actor (V)': '',
            'Political Actor (V)': '',
            'Kidnapper (V)': '',
            'Kidnapee (V)': '',
            'Robber (V)': '',
            'Journalist (V)': '',
            'Civilian (V)': '',
            'Election-related Actor (V)': '',
            'Cameroon State Actor': '',
            'Bomb': 'Bomb',
            'Gun': '',
            'Machete': '',
            'Suicide Bombing': '',
            'Other Weapon': '',
            'TK': '',
            'Drinking Establishment': '',
            'Goverment Building': 'Government Building',
            'Church': '',
            'Mosque': '',
            'Bank': '',
            'School': '',
            'Other Location': '',
            'Notes': '',
            'Sources 1':
            'http://news.xinhuanet.com/english2010/world/2011-06/08/c_13915959.htm',
            'Sources 2':
            'http://www.google.com/hostednews/afp/article/ALeqM5hofvKayKKAFFtiX9-Ic5bG2ptVmg?docId=CNG.fafcacea0287fbeab90256732f165e1e.771',
            'Sources 3': '',
            'Latitude': '',
            'Longitude': '',
            'full place name': 'Maiduguri, Borno, Nigeria',
            'country': 'Nigeria',
            'file_name': 'etk/unit_tests/ground_truth/NST-Main Sheet.xlsx',
            'dataset': 'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#8
0
 def test_tab_file(self):
     """Parse tab-separated content passed in-line via ``file_content``.

     ``file_type='tsv'`` selects tab parsing; the data comes from
     ``file_content`` and the filename is recorded in each document's
     'file_name' field.
     """
     csv_processor = CsvProcessor(etk=etk, heading_row=1)
     test_str = 'Event ID\tEvent Date\tSource Name\tSource Sectors\tSource Country\tEvent Text\tCAMEO Code\tIntensity\tTarget Name\tTarget Sectors\tTarget Country\tStory ID\tSentence Number\tPublisher\tCity\tDistrict\tProvince\tCountry\tLatitude\tLongitude\n926685\t1995-01-01\tExtremist (Russia)\tRadicals / Extremists / Fundamentalists,Dissident\tRussian Federation\tPraise or endorse\t051\t3.4\tBoris Yeltsin\tElite,Executive,Executive Office,Government\tRussian Federation\t28235806\t5\tThe Toronto Star\tMoscow\t\tMoskva\tRussian Federation\t55.7522\t37.6156\n926687\t1995-01-01\tGovernment (Bosnia and Herzegovina)\tGovernment\tBosnia and Herzegovina\tExpress intent to cooperate\t030\t4\tCitizen (Serbia)\tGeneral Population / Civilian / Social,Social\tSerbia\t28235807\t1\tThe Toronto Star\t\t\tBosnia\tBosnia and Herzegovina\t44\t18\n926686\t1995-01-01\tCitizen (Serbia)\tGeneral Population / Civilian / Social,Social\tSerbia\tExpress intent to cooperate\t030\t4\tGovernment (Bosnia and Herzegovina)\tGovernment\tBosnia and Herzegovina\t28235807\t1\tThe Toronto Star\t\t\tBosnia\tBosnia and Herzegovina\t44\t18\n926688\t1995-01-01\tCanada\t\tCanada\tPraise or endorse\t051\t3.4\tCity Mayor (Canada)\tGovernment,Local,Municipal\tCanada\t28235809\t3\tThe Toronto Star\t\t\tOntario\tCanada\t49.2501\t-84.4998\n'
     test_docs = [
         doc.cdr_document
         for doc in csv_processor.tabular_extractor(filename='testfile.tab',
                                                    file_type='tsv',
                                                    file_content=test_str,
                                                    dataset='test_set')
     ]
     # Four event rows; note the mixed types in the expectations (ints like
     # 926685, strings like '3.4') as produced by the extractor.
     expected_docs = [{
         'Event ID': 926685,
         'Event Date': '1995-01-01',
         'Source Name': 'Extremist (Russia)',
         'Source Sectors':
         'Radicals / Extremists / Fundamentalists,Dissident',
         'Source Country': 'Russian Federation',
         'Event Text': 'Praise or endorse',
         'CAMEO Code': '051',
         'Intensity': '3.4',
         'Target Name': 'Boris Yeltsin',
         'Target Sectors': 'Elite,Executive,Executive Office,Government',
         'Target Country': 'Russian Federation',
         'Story ID': 28235806,
         'Sentence Number': 5,
         'Publisher': 'The Toronto Star',
         'City': 'Moscow',
         'District': '',
         'Province': 'Moskva',
         'Country': 'Russian Federation',
         'Latitude': '55.7522',
         'Longitude': '37.6156',
         'file_name': 'testfile.tab',
         'dataset': 'test_set'
     }, {
         'Event ID': 926687,
         'Event Date': '1995-01-01',
         'Source Name': 'Government (Bosnia and Herzegovina)',
         'Source Sectors': 'Government',
         'Source Country': 'Bosnia and Herzegovina',
         'Event Text': 'Express intent to cooperate',
         'CAMEO Code': '030',
         'Intensity': 4,
         'Target Name': 'Citizen (Serbia)',
         'Target Sectors': 'General Population / Civilian / Social,Social',
         'Target Country': 'Serbia',
         'Story ID': 28235807,
         'Sentence Number': 1,
         'Publisher': 'The Toronto Star',
         'City': '',
         'District': '',
         'Province': 'Bosnia',
         'Country': 'Bosnia and Herzegovina',
         'Latitude': 44,
         'Longitude': 18,
         'file_name': 'testfile.tab',
         'dataset': 'test_set'
     }, {
         'Event ID': 926686,
         'Event Date': '1995-01-01',
         'Source Name': 'Citizen (Serbia)',
         'Source Sectors': 'General Population / Civilian / Social,Social',
         'Source Country': 'Serbia',
         'Event Text': 'Express intent to cooperate',
         'CAMEO Code': '030',
         'Intensity': 4,
         'Target Name': 'Government (Bosnia and Herzegovina)',
         'Target Sectors': 'Government',
         'Target Country': 'Bosnia and Herzegovina',
         'Story ID': 28235807,
         'Sentence Number': 1,
         'Publisher': 'The Toronto Star',
         'City': '',
         'District': '',
         'Province': 'Bosnia',
         'Country': 'Bosnia and Herzegovina',
         'Latitude': 44,
         'Longitude': 18,
         'file_name': 'testfile.tab',
         'dataset': 'test_set'
     }, {
         'Event ID': 926688,
         'Event Date': '1995-01-01',
         'Source Name': 'Canada',
         'Source Sectors': '',
         'Source Country': 'Canada',
         'Event Text': 'Praise or endorse',
         'CAMEO Code': '051',
         'Intensity': '3.4',
         'Target Name': 'City Mayor (Canada)',
         'Target Sectors': 'Government,Local,Municipal',
         'Target Country': 'Canada',
         'Story ID': 28235809,
         'Sentence Number': 3,
         'Publisher': 'The Toronto Star',
         'City': '',
         'District': '',
         'Province': 'Ontario',
         'Country': 'Canada',
         'Latitude': '49.2501',
         'Longitude': '-84.4998',
         'file_name': 'testfile.tab',
         'dataset': 'test_set'
     }]
     self.assertEqual(test_docs, expected_docs)
示例#9
0
    def test_real_csv_file_1(self) -> None:
        """Extract the first three data rows of the raw ACLED CSV.

        ``heading_row=1`` with ``content_end_row=4`` limits the extraction
        to three documents.  (The key-on-one-line / value-on-the-next layout
        below is auto-formatter output, not nested structure.)
        """
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_end_row=4,
                                     ends_with_blank_row=False)

        file_path = 'etk/unit_tests/ground_truth/acled_raw_data.csv'

        test_docs = [
            doc.cdr_document
            for doc in csv_processor.tabular_extractor(filename=file_path,
                                                       dataset='test_set')
        ]

        expected_docs = [{
            'data_id':
            336907,
            'iso':
            180,
            'event_id_cnty':
            'DRC11776',
            'event_id_no_cnty':
            11776,
            'event_date':
            '2018-01-13',
            'year':
            2018,
            'time_precision':
            1,
            'event_type':
            'Battle-No change of territory',
            'actor1':
            'Military Forces of Democratic Republic of Congo (2001-)',
            'assoc_actor_1':
            '',
            'inter1':
            1,
            'actor2':
            'ADF: Allied Democratic Forces',
            'assoc_actor_2':
            '',
            'inter2':
            2,
            'interaction':
            12,
            'region':
            'Central Africa',
            'country':
            'Democratic Republic of Congo',
            'admin1':
            'Nord-Kivu',
            'admin2':
            'Nord-Kivu',
            'admin3':
            'Oicha',
            'location':
            'Oicha',
            'latitude':
            '0.7',
            'longitude':
            '29.5167',
            'geo_precision':
            1,
            'source':
            'Radio Okapi',
            'source_scale':
            'Subnational',
            'notes':
            "Presumed FARDC attacked the ADF in the periphery of Oicha on January 13th. Shots were heard "
            "in the locality and it is suspected that the FARDC are attacking the 'death triangle' situated in between "
            "Mbau, Kamango and Eringeti. The reports are not confirmed by the military.",
            'fatalities':
            0,
            'timestamp':
            1516117305,
            'file_name':
            'etk/unit_tests/ground_truth/acled_raw_data.csv',
            'dataset':
            'test_set'
        }, {
            'data_id':
            336908,
            'iso':
            180,
            'event_id_cnty':
            'DRC11777',
            'event_id_no_cnty':
            11777,
            'event_date':
            '2018-01-13',
            'year':
            2018,
            'time_precision':
            1,
            'event_type':
            'Battle-No change of territory',
            'actor1':
            'Military Forces of Democratic Republic of Congo (2001-)',
            'assoc_actor_1':
            '',
            'inter1':
            1,
            'actor2':
            'ADF: Allied Democratic Forces',
            'assoc_actor_2':
            '',
            'inter2':
            2,
            'interaction':
            12,
            'region':
            'Central Africa',
            'country':
            'Democratic Republic of Congo',
            'admin1':
            'Nord-Kivu',
            'admin2':
            'Beni',
            'admin3':
            'Beni',
            'location':
            'Beni',
            'latitude':
            '0.49658',
            'longitude':
            '29.4654',
            'geo_precision':
            1,
            'source':
            'Reuters; Radio Okapi',
            'source_scale':
            'Subnational',
            'notes':
            'The FARDC launched, on January 13th, an offensive against the ADF in Beni and Lubero, '
            'in response to the recents attacks by the group. Gunfires and explosions were heard '
            'in Beni all throughout Saturday (13th).',
            'fatalities':
            0,
            'timestamp':
            1516117305,
            'file_name':
            'etk/unit_tests/ground_truth/acled_raw_data.csv',
            'dataset':
            'test_set'
        }, {
            'data_id':
            336909,
            'iso':
            180,
            'event_id_cnty':
            'DRC11778',
            'event_id_no_cnty':
            11778,
            'event_date':
            '2018-01-13',
            'year':
            2018,
            'time_precision':
            1,
            'event_type':
            'Battle-No change '
            'of territory',
            'actor1':
            'Military Forces of Democratic Republic of Congo (2001-)',
            'assoc_actor_1':
            '',
            'inter1':
            1,
            'actor2':
            'ADF: Allied Democratic Forces',
            'assoc_actor_2':
            '',
            'inter2':
            2,
            'interaction':
            12,
            'region':
            'Central Africa',
            'country':
            'Democratic Republic of Congo',
            'admin1':
            'Nord-Kivu',
            'admin2':
            'Nord-Kivu',
            'admin3':
            'Lubero',
            'location':
            'Lubero',
            'latitude':
            '-0.15867',
            'longitude':
            '29.2386',
            'geo_precision':
            1,
            'source':
            'Reuters; Radio Okapi',
            'source_scale':
            'Subnational',
            'notes':
            'The FARDC launched, on January 13th, an offensive '
            'against the ADF in Beni and Lubero, in response to the recents attacks by the group.',
            'fatalities':
            0,
            'timestamp':
            1516117305,
            'file_name':
            'etk/unit_tests/ground_truth/acled_raw_data.csv',
            'dataset':
            'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#10
0
 def __init__(self, etk):
     """Set up the extractors and decoders this ETK module relies on."""
     ETKModule.__init__(self, etk)
     # Date parser dedicated to this module's date fields.
     self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
     # Case-insensitive glossary extractors (ngrams=3) over gzipped JSON
     # glossaries resolved under ${GLOSSARY_PATH}: countries, US/Canada
     # states, and cities.
     self.country_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/countries.json.gz", read_json=True),
                                                "country_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=3)
     self.states_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
                                               "states_extractor",
                                               self.etk.default_tokenizer,
                                               case_sensitive=False,
                                               ngrams=3)
     self.cities_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/cities.json.gz", read_json=True),
                                               "cities_extractor",
                                               self.etk.default_tokenizer,
                                               case_sensitive=False,
                                               ngrams=3)
     # Row-oriented CSV processor; first row is the heading row.
     self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
     # Maps two-digit interaction codes to human-readable actor-pair labels.
     self.interaction_decoding_dict = {
         "10": "Sole Military Action",
         "11": "Military Versus Military",
         "12": "Military Versus Rebels",
         "13": "Military Versus Political Militia",
         "14": "Military Versus Communal Militia",
         "15": "Military Versus Rioters",
         "16": "Military Versus Protesters",
         "17": "Military Versus Civilians",
         "18": "Military Versus Other",
         "20": "Sole Rebel Action",
         "22": "Rebels Versus Rebels",
         "23": "Rebels Versus Political Militia",
         "24": "Rebels Versus Communal Militia",
         "25": "Rebels Versus Rioters",
         "26": "Rebels Versus Protesters",
         "27": "Rebels Versus Civilians",
         "28": "Rebels Versus Other",
         "30": "Sole Political Militia Action",
         "33": "Political Militia Versus Political Militia",
         "34": "Political Militia Versus Communal Militia",
         "35": "Political Militia Versus Rioters",
         "36": "Political Militia Versus Protesters",
         "37": "Political Militia Versus Civilians",
         "38": "Political Militia Versus Other",
         "40": "Sole Communal Militia Action",
         "44": "Communal Militia Versus Communal Militia",
         "45": "Communal Militia Versus Rioters",
         "46": "Communal Militia Versus Protesters",
         "47": "Communal Militia Versus Civilians",
         "48": "Communal Militia Versus Other",
         "50": "Sole Rioter Action",
         "55": "Rioters Versus Rioters",
         "56": "Rioters Versus Protesters",
         "57": "Rioters Versus Civilians",
         "58": "Rioters Versus Other",
         "60": "Sole Protester Action",
         "66": "Protesters Versus Protesters",
         "68": "Protesters Versus Other",
         "78": "Other Actor Versus Civilians",
         "80": "Sole Other Action"
     }
     # Decoder over the table above with case-sensitive key matching.
     # NOTE(review): 'default_decoding' appears to be the fallback value for
     # unknown codes — confirm against DecodingValueExtractor's signature.
     self.interaction_decoder = DecodingValueExtractor(
         self.interaction_decoding_dict,
         'default_decoding',
         case_sensitive=True)
示例#11
0
    def __init__(self, etk):
        """Initialize the module by delegating to the ETKModule base class."""
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        """Intentionally a no-op: this module does not transform documents."""


if __name__ == "__main__":
    # Sample table with a heading row, a blank row, and ragged/extra columns.
    # The leading indentation inside the literal is intentional test data —
    # do not dedent it.
    csv_str = """text,with,Polish,non-Latin,lettes
    1,2,3,4,5,6
    a,b,c,d,e,f

    gęś,zółty,wąż,idzie,wąską,dróżką,
    ,b,c,s,w,f
    """
    etk = ETK(modules=CsvETKModule)
    cp = CsvProcessor(etk=etk,
                      heading_row=1,
                      heading_columns=(1, 3),
                      content_end_row=3,
                      ends_with_blank_row=True,
                      remove_leading_empty_rows=True,
                      required_columns=['text'])

    # The dataset label is passed directly to tabular_extractor; the unused
    # `data_set` local that previously sat here was removed.
    docs = [
        doc.cdr_document
        for doc in cp.tabular_extractor(table_str=csv_str,
                                        dataset='test_csv_str_with_all_args')
    ]
    pprint.pprint(docs)
示例#12
0
            # for segment in doc.select_segments(jsonpath='$.notes'):
            #     doc.kg.add_value("description", segment.value)
            doc.kg.add_value("description", json_path='$.notes')

    def document_selector(self, doc) -> bool:
        """Decide whether this module should process *doc*.

        Args:
            doc: Document to evaluate.

        Returns:
            True if the default document selector accepts the document.
        """
        selector = DefaultDocumentSelector()
        return selector.select_document(doc)


if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG, the DIG master_config can
    # be used as the schema. Use a context manager so the schema file handle
    # is closed deterministically (the original bare open() leaked it).
    with open('master_config.json') as schema_file:
        kg_schema = KGSchema(json.load(schema_file))
    etk = ETK(modules=AcledModule, kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1)

    data_set = 'test_data_set_csv'
    docs = cp.tabular_extractor(filename="acled_raw_data.csv",
                                dataset='acled',
                                doc_id_field="data_id")

    # Process only the first extracted document and show its knowledge graph.
    results = etk.process_ems(docs[0])

    print(json.dumps(results[0].value, indent=2))
示例#13
0
                         value=doc.extract(
                             self.country_decoder,
                             doc.select_segments("$.ActorCountryCode")[0]))

        # Note: not mapping the Actor Geo codes, because Pedro doesn't understand what they mean.
        return list()


if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG, the DIG master_config can
    # be used as the schema. The context manager closes the schema file,
    # which the original bare open() call leaked.
    with open('../events_ucdp/master_config.json') as schema_file:
        kg_schema = KGSchema(json.load(schema_file))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[GdeltModule, GdeltActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the TSV file
    cp = CsvProcessor(etk=etk,
                      heading_columns=(1, len(GdeltModule.header_fields)),
                      column_name_prefix="COL")

    with open("gdelt.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet; each row document may
        # spawn further documents, all handled by process_ems.
        for d in cp.tabular_extractor(filename="20170912.export_sample.tsv",
                                      dataset='gdelt'):
            for result in etk.process_ems(d):
                print(result.cdr_document.get("knowledge_graph"))
                f.write(json.dumps(result.cdr_document) + "\n")
示例#14
0
    def test_dataframe_input(self) -> None:
        """Extract documents from a pandas DataFrame and compare to ground truth."""
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_end_row=2,
                                     ends_with_blank_row=False)

        file_path = 'etk/unit_tests/ground_truth/acled_raw_data.csv'
        frame = pd.read_csv(file_path)
        test_docs = [
            document.cdr_document
            for document in csv_processor.tabular_extractor(dataframe=frame,
                                                            dataset='test_set')
        ]

        # Single expected row from the ACLED ground-truth file, plus the
        # 'dataset' tag added by the extractor.
        expected_docs = [{
            'data_id': 336907,
            'iso': 180,
            'event_id_cnty': 'DRC11776',
            'event_id_no_cnty': 11776,
            'event_date': '2018-01-13',
            'year': 2018,
            'time_precision': 1,
            'event_type': 'Battle-No change of territory',
            'actor1': 'Military Forces of Democratic Republic of Congo (2001-)',
            'assoc_actor_1': np.nan,
            'inter1': 1,
            'actor2': 'ADF: Allied Democratic Forces',
            'assoc_actor_2': np.nan,
            'inter2': 2,
            'interaction': 12,
            'region': 'Central Africa',
            'country': 'Democratic Republic of Congo',
            'admin1': 'Nord-Kivu',
            'admin2': 'Nord-Kivu',
            'admin3': 'Oicha',
            'location': 'Oicha',
            'latitude': 0.7,
            'longitude': 29.5167,
            'geo_precision': 1,
            'source': 'Radio Okapi',
            'source_scale': 'Subnational',
            'notes':
            "Presumed FARDC attacked the ADF in the periphery of Oicha on January 13th. Shots were heard "
            "in the locality and it is suspected that the FARDC are attacking the 'death triangle' situated in between "
            "Mbau, Kamango and Eringeti. The reports are not confirmed by the military.",
            'fatalities': 0,
            'timestamp': 1516117305,
            'dataset': 'test_set'
        }]

        self.assertEqual(test_docs, expected_docs)
示例#15
0
        return doc.cdr_document.get(
            "dataset") == "lake_chad_basin_displaced_victim"

    def process_document(self, doc: Document) -> List[Document]:
        """Map the displaced-victim size and type fields onto the KG.

        Returns an empty list: no child documents are created.
        """
        doc.kg.add_value("size", json_path="total")
        doc.kg.add_value("type", json_path="type")
        return []


if __name__ == "__main__":
    # Usage: script.py <data_dir> <master_config_path>
    dir_path = sys.argv[1]
    master_config_path = sys.argv[2]
    file_name = 'lake_chad_basin_displaced.csv'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')

    # Close the schema file deterministically; the original bare open()
    # call leaked the handle.
    with open(master_config_path) as schema_file:
        kg_schema = KGSchema(json.load(schema_file))
    etk = ETK(modules=[
        LakeChadBasinDisplacedModule, LakeChadBasinDisplacedVictimModule,
        LCBPlaceModule
    ],
              kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1, content_start_row=3)

    with open(output_path, "w") as f:
        print(input_path, output_path)
        for doc in cp.tabular_extractor(filename=input_path,
                                        dataset='lake_chad_basin_displaced'):
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
示例#16
0
class AcledModule(ETKModule):
    """ETK module that turns ACLED CSV rows into knowledge-graph documents.

    It owns a CSV processor plus date and glossary extractors; for every row
    document produced from a raw-content CSV it populates event date, title,
    description, country/state/city and type fields on the knowledge graph.
    """

    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
        # Glossary extractors resolve free-text place names; all three are
        # case-insensitive and match n-grams of up to 3 tokens.
        self.country_extractor = GlossaryExtractor(self.etk.load_glossary(
            "${GLOSSARY_PATH}/countries.json.gz", read_json=True),
                                                   "country_extractor",
                                                   self.etk.default_tokenizer,
                                                   case_sensitive=False,
                                                   ngrams=3)
        self.states_extractor = GlossaryExtractor(self.etk.load_glossary(
            "${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
                                                  "states_extractor",
                                                  self.etk.default_tokenizer,
                                                  case_sensitive=False,
                                                  ngrams=3)
        self.cities_extractor = GlossaryExtractor(self.etk.load_glossary(
            "${GLOSSARY_PATH}/cities.json.gz", read_json=True),
                                                  "cities_extractor",
                                                  self.etk.default_tokenizer,
                                                  case_sensitive=False,
                                                  ngrams=3)
        self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
        # Maps ACLED numeric "interaction" codes (stored as strings) to
        # human-readable actor-vs-actor labels.
        self.interaction_decoding_dict = {
            "10": "Sole Military Action",
            "11": "Military Versus Military",
            "12": "Military Versus Rebels",
            "13": "Military Versus Political Militia",
            "14": "Military Versus Communal Militia",
            "15": "Military Versus Rioters",
            "16": "Military Versus Protesters",
            "17": "Military Versus Civilians",
            "18": "Military Versus Other",
            "20": "Sole Rebel Action",
            "22": "Rebels Versus Rebels",
            "23": "Rebels Versus Political Militia",
            "24": "Rebels Versus Communal Militia",
            "25": "Rebels Versus Rioters",
            "26": "Rebels Versus Protesters",
            "27": "Rebels Versus Civilians",
            "28": "Rebels Versus Other",
            "30": "Sole Political Militia Action",
            "33": "Political Militia Versus Political Militia",
            "34": "Political Militia Versus Communal Militia",
            "35": "Political Militia Versus Rioters",
            "36": "Political Militia Versus Protesters",
            "37": "Political Militia Versus Civilians",
            "38": "Political Militia Versus Other",
            "40": "Sole Communal Militia Action",
            "44": "Communal Militia Versus Communal Militia",
            "45": "Communal Militia Versus Rioters",
            "46": "Communal Militia Versus Protesters",
            "47": "Communal Militia Versus Civilians",
            "48": "Communal Militia Versus Other",
            "50": "Sole Rioter Action",
            "55": "Rioters Versus Rioters",
            "56": "Rioters Versus Protesters",
            "57": "Rioters Versus Civilians",
            "58": "Rioters Versus Other",
            "60": "Sole Protester Action",
            "66": "Protesters Versus Protesters",
            "68": "Protesters Versus Other",
            "78": "Other Actor Versus Civilians",
            "80": "Sole Other Action"
        }
        self.interaction_decoder = DecodingValueExtractor(
            self.interaction_decoding_dict,
            'default_decoding',
            case_sensitive=True)

    def process_document(self, cdr_doc: Document):
        """Expand a CDR document that points at a raw ACLED CSV file into one
        knowledge-graph-annotated document per CSV row.

        Returns the list of newly created row documents (empty when the input
        document has no usable 'raw_content_path').
        """
        new_docs = list()

        cdr_doc_json = cdr_doc.cdr_document
        # Only proceed when the CDR document carries a non-blank path to the
        # raw CSV content.
        if 'raw_content_path' in cdr_doc_json and cdr_doc_json[
                'raw_content_path'].strip() != '':
            try:
                docs = self.csv_processor.tabular_extractor(
                    filename=cdr_doc_json['raw_content_path'],
                    dataset='acleddata',
                    doc_id_field="data_id")
                for doc in docs:
                    doc_json = doc.cdr_document
                    # Parse every event_date value and record each extraction
                    # on the KG.
                    event_date = doc.select_segments(jsonpath='$.event_date')
                    for segment in event_date:
                        extractions = doc.extract(
                            extractor=self.date_extractor, extractable=segment)

                        for extraction in extractions:
                            doc.kg.add_value("event_date",
                                             value=extraction.value)

                    doc.kg.add_value("website", value='acleddata.com')
                    doc.kg.add_value("description", json_path='$.notes')
                    # Title built from raw row fields, e.g.
                    # "2018-01-13: Battle in Oicha".
                    acled_title = "{event_date}: {event_type} in {location}".format(
                        event_date=doc.cdr_document.get("event_date", ''),
                        event_type=doc.cdr_document.get("event_type", ''),
                        location=doc.cdr_document.get("location", ''))
                    doc.kg.add_value("title", value=acled_title)

                    doc.kg.add_value('country', json_path="$.country")

                    # NOTE(review): ACLED rows seen elsewhere in this file
                    # expose 'admin1'..'admin3' rather than 'state' — confirm
                    # a '$.state' field actually exists in these rows.
                    states_segments = doc.select_segments("$.state")
                    for state_segment in states_segments:
                        extracted_states = doc.extract(self.states_extractor,
                                                       state_segment)
                        doc.kg.add_value("state", value=extracted_states)

                    # Decode the numeric interaction code into a readable
                    # "X Versus Y" type label.
                    interaction_segments = doc.select_segments("$.interaction")
                    for interaction_segment in interaction_segments:
                        extracted_interaction = doc.extract(
                            self.interaction_decoder, interaction_segment)
                        doc.kg.add_value("type", value=extracted_interaction)

                    doc.kg.add_value("type", json_path="$.event_type")
                    doc.kg.add_value("type", json_path="$.type")
                    doc.kg.add_value("type", value="Event")

                    # Mine the location string for country, city and state
                    # mentions via the glossary extractors.
                    location_segments = doc.select_segments("$.location")
                    for location_segment in location_segments:
                        ec_location = doc.extract(self.country_extractor,
                                                  location_segment)
                        doc.kg.add_value("country", ec_location)

                        ecity_location = doc.extract(self.cities_extractor,
                                                     location_segment)
                        doc.kg.add_value("city_name", ecity_location)

                        es_location = doc.extract(self.states_extractor,
                                                  location_segment)
                        doc.kg.add_value("state", es_location)

                    new_docs.append(doc)
            except Exception as e:
                # Re-wrap with module context; the original exception travels
                # in the args tuple.
                raise Exception('Error in AcledModule', e)
        return new_docs

    def document_selector(self, doc) -> bool:
        """
        Boolean function for selecting document
        Args:
            doc: Document

        Returns:
            True when the document's dataset tag is "acleddata".
        """
        return doc.cdr_document.get("dataset") == "acleddata"
示例#17
0

if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG, the DIG master_config can
    # be used as the schema. The context manager closes the schema file,
    # which the original bare open() call leaked.
    with open('master_config.json') as schema_file:
        kg_schema = KGSchema(json.load(schema_file))

    # Instantiate ETK, with the processing modules and the schema.
    etk = ETK(modules=[
        GTDModule, GTDDamageModule, GTDInjuriesModule, GTDFatalitiesModule,
        GTDWeaponsModule, GTDActorModule, GTDVictimModule, GTDPlaceModule
    ],
              kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("gtd.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet, frame each row
        # document and append it to the JSON-lines output.
        for doc in cp.tabular_extractor(
                filename="globalterrorismdb_0617dist-nigeria.csv",
                dataset='gtd'):
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
示例#18
0
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        # Intentionally a no-op: the extraction for this example is driven by
        # the __main__ block below, not by per-document processing.
        pass


if __name__ == "__main__":
    # Paths for the ground-truth input CSV and the JSON output file.
    file_path = 'etk/unit_tests/ground_truth/queryResults.csv'
    jl_file_path = 'etk/unit_tests/ground_truth/queryResults.jl'
    etk = ETK(modules=CsvETKModule)
    csv_processor = CsvProcessor(etk=etk,
                                 heading_row=2,
                                 content_start_row=10,
                                 content_end_row=1723)

    data_set = 'test_data_set_csv'
    test_docs = [
        doc.cdr_document
        for doc in csv_processor.tabular_extractor(filename=file_path,
                                                   dataset='test_set')
    ]
    # Write the extracted documents as one JSON array. The context manager
    # guarantees the handle is flushed and closed — the original
    # open(..., 'w') was never closed — and jl_file_path replaces the
    # duplicated 'news_path' constant. The unused csv_str sample and
    # commented-out code were removed.
    with open(jl_file_path, 'w') as news_data:
        news_data.write(json.dumps(test_docs))
示例#19
0
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG, the DIG master_config can
    # be used as the schema. The context manager closes the schema file,
    # which the original bare open() call leaked.
    with open('master_config.json') as schema_file:
        kg_schema = KGSchema(json.load(schema_file))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_and_frame will also process
            # any new documents created while processing each doc.
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
示例#20
0

def convert_plus_to_list(value):
    """Split a '+'-delimited string into trimmed parts, dropping 'has_topic' entries.

    Args:
        value: a string such as "a + b + has_topic c".

    Returns:
        List of whitespace-stripped segments that do not contain 'has_topic'.
    """
    # Build a fresh list instead of calling list.remove() while iterating the
    # same list: the original loop skipped the element that followed each
    # removal, so consecutive 'has_topic' entries survived the filter.
    return [
        part.strip() for part in value.split('+')
        if 'has_topic' not in part
    ]


if __name__ == "__main__":
    # Usage: script.py <input_csv_path> <output_json_path>
    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    input_path = args[0]
    output_path = args[1]

    etk = ETK(modules=CsvETKModule)
    csv_processor = CsvProcessor(etk=etk, heading_row=1)

    data_set = 'elicit_gdelt_mapping'
    parsed_docs = [
        doc.cdr_document
        for doc in csv_processor.tabular_extractor(filename=input_path,
                                                   dataset='elicit_mapping')
    ]

    # Use a context manager so the output file is flushed and closed even if
    # serialization raises; the original open(...).write(...) leaked the handle.
    with open(output_path, 'w') as out_file:
        out_file.write(json.dumps(parse_gdelt_mapping(parsed_docs), indent=2))