def eventize_and_enrich(commits, git_enrich):
    """Build events from commits and enrich them with file information.

    Runs, in order: ``Git(commits, git_enrich).eventize(2)``,
    ``FilterRows.filter_`` on the ``filepath`` column with value ``"-"``,
    ``FileType`` and ``FilePath`` enrichment keyed on ``filepath``, and
    ``ToUTF8`` on the ``owner`` column (surrogate handling). The number of
    events is logged after every step.

    When eventizing yields no events, the filter and enrichment steps are
    skipped and the empty result is returned as is.

    :param commits: commits to eventize; passed straight to ``Git``.
    :param git_enrich: second argument handed to ``Git`` (presumably the
        git enricher instance -- confirm against callers).
    :returns: the events produced by the pipeline (the ``_df`` naming
        suggests a pandas DataFrame -- confirm against ``Git.eventize``).
    """
    logging.info("New commits: %s", len(commits))

    # Create events from commits
    # TODO add tests for eventize method
    events_df = Git(commits, git_enrich).eventize(2)
    logging.info("New events: %s", len(events_df))

    # Only run the column-based steps when there is something to work on.
    # NOTE(review): an empty result may lack the 'filepath' / 'owner'
    # columns the steps below operate on -- confirm against Git.eventize.
    if len(events_df) > 0:
        # Filter information
        events_df = FilterRows(events_df).filter_(["filepath"], "-")
        logging.info("New events filtered: %s", len(events_df))

        # Add filetype info
        events_df = FileType(events_df).enrich('filepath')
        logging.info("New Filetype events: %s", len(events_df))

        # Split filepath info
        events_df = FilePath(events_df).enrich('filepath')
        logging.info("New Filepath events: %s", len(events_df))

        # Deal with surrogates
        events_df = ToUTF8(events_df).enrich(["owner"])

    logging.info("Final new events: %s", len(events_df))

    return events_df
def process(self, items_block):
    """Eventize commits into per-file items and attach file information.

    Every commit is expanded into one item per file found in it (files
    with no actions performed on them are excluded). Each resulting event
    gets file path, file name, path parts, file type and file extension
    fields.

    :param items_block: items to be processed. Expects to find
        ElasticSearch hits _source part only.
    :returns: ``self.ProcessResults`` with ``processed`` set to the number
        of resulting events and ``out_items`` set to the events themselves.
    """
    log_prefix = self.__log_prefix

    logger.debug("{} New commits: {}".format(log_prefix, len(items_block)))

    # Expand the commits into events
    events_df = Git(items_block, self._git_enrich).eventize(2)

    logger.debug("{} New events: {}".format(log_prefix, len(events_df)))

    if len(events_df) > 0:
        # Filter rows on the 'filepath' column using the "-" marker
        events_df = FilterRows(events_df).filter_(["filepath"], "-")

        logger.debug("{} New events filtered: {}".format(
            log_prefix, len(events_df)))

        # Cap every commit message at MESSAGE_MAX_SIZE characters
        shortened = events_df['message'].str.slice(
            stop=AreasOfCode.MESSAGE_MAX_SIZE)
        events_df['message'] = shortened

        logger.debug("{} Remove message content".format(log_prefix))

        # File type information
        events_df = FileType(events_df).enrich('filepath')

        logger.debug("{} New Filetype events: {}".format(
            log_prefix, len(events_df)))

        # File path parts
        events_df = FilePath(events_df).enrich('filepath')

        logger.debug("{} New Filepath events: {}".format(
            log_prefix, len(events_df)))

        # The origin of an event is the repository it comes from
        events_df['origin'] = events_df['repository']

        # Surrogates in the 'owner' field
        events_df = ToUTF8(events_df).enrich(["owner"])

    logger.debug("{} Final new events: {}".format(log_prefix, len(events_df)))

    return self.ProcessResults(processed=len(events_df), out_items=events_df)
def process(self, items_block):
    """Eventize commits into per-file items and attach file information.

    Every commit is expanded into one item per file found in it (files
    with no actions performed on them are excluded). Each resulting event
    gets file path, file name, path parts, file type and file extension
    fields.

    :param items_block: items to be processed. Expects to find
        ElasticSearch hits _source part only.
    :returns: ``self.ProcessResults`` with ``processed`` set to the number
        of resulting events and ``out_items`` set to the events themselves.
    """
    logger.info("New commits: " + str(len(items_block)))

    # Expand the commits into events
    events_df = Git(items_block, self._git_enrich).eventize(2)

    logger.info("New events: " + str(len(events_df)))

    if len(events_df) > 0:
        # Filter rows on the 'filepath' column using the "-" marker
        events_df = FilterRows(events_df).filter_(["filepath"], "-")

        logger.info("New events filtered: " + str(len(events_df)))

        # File type first, then file path parts; both keyed on 'filepath'
        filepath_steps = (
            (FileType, "New Filetype events: "),
            (FilePath, "New Filepath events: "),
        )
        for enricher_cls, label in filepath_steps:
            events_df = enricher_cls(events_df).enrich('filepath')

            logger.info(label + str(len(events_df)))

        # Surrogates in the 'owner' field
        events_df = ToUTF8(events_df).enrich(["owner"])

    logger.info("Final new events: " + str(len(events_df)))

    return self.ProcessResults(processed=len(events_df), out_items=events_df)
def test_FilePath(self):
    """ Test FilePath enricher"""

    # Enriching an empty frame must give back an empty frame
    enriched_empty = FilePath(pandas.DataFrame()).enrich('filepath')
    self.assertTrue(enriched_empty.empty)

    # One record per input path, listed in DataFrame row order. Each record
    # holds the input 'filepath' plus the values expected after enrichment.
    expected_rows = [
        {
            'filepath': 'file.txt',
            'file_name': 'file.txt',
            'file_ext': 'txt',
            'file_dir_name': '/',
            'file_path_list': ['file.txt'],
        },
        {
            'filepath': '/foo/bar',
            'file_name': 'bar',
            'file_ext': '',
            'file_dir_name': '/foo/',
            'file_path_list': ['foo', 'bar'],
        },
        {
            'filepath': '/foo/bar/file.txt',
            'file_name': 'file.txt',
            'file_ext': 'txt',
            'file_dir_name': '/foo/bar/',
            'file_path_list': ['foo', 'bar', 'file.txt'],
        },
        {
            'filepath': '/foo/bar/',
            'file_name': '',
            'file_ext': '',
            'file_dir_name': '/foo/bar/',
            'file_path_list': ['foo', 'bar'],
        },
        {
            'filepath': '/foo//bar.txt',
            'file_name': 'bar.txt',
            'file_ext': 'txt',
            'file_dir_name': '/foo/',
            'file_path_list': ['foo', 'bar.txt'],
        },
        {
            'filepath': '//foo///bar.txt',
            'file_name': 'bar.txt',
            'file_ext': 'txt',
            'file_dir_name': '/foo/',
            'file_path_list': ['foo', 'bar.txt'],
        },
    ]

    test_df = pandas.DataFrame()
    test_df['filepath'] = [row['filepath'] for row in expected_rows]

    enriched_df = FilePath(test_df).enrich('filepath')

    # Check column by column, and within each column row by row
    checked_columns = ('filepath', 'file_name', 'file_ext',
                       'file_dir_name', 'file_path_list')
    for column in checked_columns:
        for position, row in enumerate(expected_rows):
            self.assertEqual(enriched_df.iloc[[position]][column].item(),
                             row[column])
def test_FilePath(self):
    """ Test FilePath enricher"""

    # Empty input: the enriched result has to be empty as well
    no_rows = pandas.DataFrame()
    self.assertTrue(FilePath(no_rows).enrich('filepath').empty)

    # Column names, in the order their values are verified
    fields = ('filepath', 'file_name', 'file_ext',
              'file_dir_name', 'file_path_list')

    # (filepath, file_name, file_ext, file_dir_name, file_path_list)
    # One tuple per row of the test frame, in row order.
    cases = (
        ('file.txt', 'file.txt', 'txt', '/', ['file.txt']),
        ('/foo/bar', 'bar', '', '/foo/', ['foo', 'bar']),
        ('/foo/bar/file.txt', 'file.txt', 'txt', '/foo/bar/',
         ['foo', 'bar', 'file.txt']),
        ('/foo/bar/', '', '', '/foo/bar/', ['foo', 'bar']),
        ('/foo//bar.txt', 'bar.txt', 'txt', '/foo/', ['foo', 'bar.txt']),
        ('//foo///bar.txt', 'bar.txt', 'txt', '/foo/', ['foo', 'bar.txt']),
    )
    expectations = [dict(zip(fields, case)) for case in cases]

    test_df = pandas.DataFrame()
    test_df['filepath'] = [wanted['filepath'] for wanted in expectations]

    enricher = FilePath(test_df)
    enriched_df = enricher.enrich('filepath')

    # All rows of one field are verified before moving to the next field
    for field in fields:
        for index, wanted in enumerate(expectations):
            actual = enriched_df.iloc[[index]][field].item()
            self.assertEqual(actual, wanted[field])