def test_filter_rows(self):
    """Test several cases for filtering rows by column value.

    Each scenario builds a DataFrame, asks ``FilterRows.filter_`` to
    filter on the value "-" and checks the number of remaining rows:

    * one column holding values of different types,
    * one empty column,
    * several columns where the first one is created empty.
    """

    # One column, values of different types
    df = pandas.DataFrame()
    filepaths = [
        '',
        None,
        '-',
        '/file/path',
        1,
        True,
        # Plain float NaN; the `pandas.np` alias no longer exists in
        # pandas >= 2.0, so it must not be used here.
        float('nan'),
        '-',
        [1, 2],
    ]
    df["filepath"] = filepaths
    data_filtered = FilterRows(df)
    df = data_filtered.filter_(["filepath"], "-")

    # Nine values were given and two of them are "-"
    self.assertEqual(len(df), 7)

    # One empty column
    df = pandas.DataFrame()
    df["filepath"] = []
    data_filtered = FilterRows(df)
    df = data_filtered.filter_(["filepath"], "-")

    self.assertEqual(len(df), 0)

    # Several columns and just one empty
    df = pandas.DataFrame()
    df["filepath"] = []
    df["name"] = ["name", "-", "other", "-"]
    df["dirname"] = ["dir", "-", "-", "-"]
    data_filtered = FilterRows(df)
    df = data_filtered.filter_(["filepath", "name", "dirname"], "-")

    # Only the first row has no "-" in either "name" or "dirname"
    self.assertEqual(len(df), 1)
def test_column_not_exists(self):
    """Test filtering an empty DataFrame by a column it does not have.

    ``filter_`` is expected to raise ``ValueError`` with a message naming
    the missing column and the (empty) list of available columns.
    """

    df = pandas.DataFrame()
    data_filtered = FilterRows(df)

    # The expected message is a regular expression, hence the raw string:
    # `\[\]` matches the literal "[]" printed for an empty column list.
    with self.assertRaisesRegex(
            ValueError,
            r"Column filepath not in DataFrame columns: \[\]"):
        data_filtered.filter_(["filepath"], "-")
def eventize_and_enrich(commits, git_enrich):
    """Create events from commits and enrich them with file information.

    The commits are eventized through ``Git(...).eventize(2)``. When at
    least one event is produced, the events are filtered on the
    "filepath" column with the value "-", enriched by ``FileType`` and
    ``FilePath`` on "filepath", and finally passed through ``ToUTF8`` for
    the "owner" column. The number of events is logged after every step.

    :param commits: commits to be eventized; must support ``len()``.
    :param git_enrich: object handed over to ``Git`` together with the
        commits.
    :return: the resulting events; when no events were produced, the
        result of ``eventize`` is returned untouched.
    """

    logging.info("New commits: " + str(len(commits)))

    # Create events from commits
    # TODO add tests for eventize method
    git_events = Git(commits, git_enrich)
    events_df = git_events.eventize(2)

    logging.info("New events: " + str(len(events_df)))

    # Skip filtering and enrichment when there is nothing to work on.
    # NOTE(review): presumably an empty result may lack the "filepath"
    # column, in which case filter_ raises ValueError - confirm against
    # Git.eventize.
    if len(events_df) > 0:
        # Filter information
        data_filtered = FilterRows(events_df)
        events_df = data_filtered.filter_(["filepath"], "-")

        logging.info("New events filtered: " + str(len(events_df)))

        # Add filetype info
        enriched_filetype = FileType(events_df)
        events_df = enriched_filetype.enrich('filepath')

        logging.info("New Filetype events: " + str(len(events_df)))

        # Split filepath info
        enriched_filepath = FilePath(events_df)
        events_df = enriched_filepath.enrich('filepath')

        logging.info("New Filepath events: " + str(len(events_df)))

        # Deal with surrogates
        convert = ToUTF8(events_df)
        events_df = convert.enrich(["owner"])

    logging.info("Final new events: " + str(len(events_df)))

    return events_df
def process(self, items_block):
    """Eventize a block of commits and add file related information.

    One new item is created per file found in each commit (files with no
    actions performed on them are excluded). Every event is given file
    path, file name, path parts, file type and file extension fields.

    :param items_block: items to be processed. Expects to find
        ElasticSearch hits _source part only.
    :return: ``ProcessResults`` carrying the number of resulting events
        and the events themselves.
    """

    log_prefix = self.__log_prefix

    logger.debug("{} New commits: {}".format(log_prefix, len(items_block)))

    # Commits -> events
    events = Git(items_block, self._git_enrich).eventize(2)

    logger.debug("{} New events: {}".format(log_prefix, len(events)))

    # The pipeline below only runs when there is at least one event
    if len(events) > 0:
        # Filter on the "filepath" column using the value "-"
        events = FilterRows(events).filter_(["filepath"], "-")
        logger.debug("{} New events filtered: {}".format(log_prefix, len(events)))

        # Cap the commit message length
        events['message'] = events['message'].str.slice(stop=AreasOfCode.MESSAGE_MAX_SIZE)
        logger.debug("{} Remove message content".format(log_prefix))

        # File type enrichment
        events = FileType(events).enrich('filepath')
        logger.debug("{} New Filetype events: {}".format(log_prefix, len(events)))

        # File path enrichment
        events = FilePath(events).enrich('filepath')
        logger.debug("{} New Filepath events: {}".format(log_prefix, len(events)))

        # "origin" mirrors the "repository" column
        events['origin'] = events['repository']

        # Deal with surrogates in the "owner" column
        events = ToUTF8(events).enrich(["owner"])

    logger.debug("{} Final new events: {}".format(log_prefix, len(events)))

    return self.ProcessResults(processed=len(events), out_items=events)
def process(self, items_block):
    """Eventize a block of commits and add file related information.

    One new item is created per file found in each commit (files with no
    actions performed on them are excluded). Every event is given file
    path, file name, path parts, file type and file extension fields.

    :param items_block: items to be processed. Expects to find
        ElasticSearch hits _source part only.
    :return: ``ProcessResults`` carrying the number of resulting events
        and the events themselves.
    """

    logger.info("New commits: " + str(len(items_block)))

    # Commits -> events
    events = Git(items_block, self._git_enrich).eventize(2)

    logger.info("New events: " + str(len(events)))

    # The pipeline below only runs when there is at least one event
    if len(events) > 0:
        # Filter on the "filepath" column using the value "-"
        events = FilterRows(events).filter_(["filepath"], "-")
        logger.info("New events filtered: " + str(len(events)))

        # File type enrichment
        events = FileType(events).enrich('filepath')
        logger.info("New Filetype events: " + str(len(events)))

        # File path enrichment
        events = FilePath(events).enrich('filepath')
        logger.info("New Filepath events: " + str(len(events)))

        # Deal with surrogates in the "owner" column
        events = ToUTF8(events).enrich(["owner"])

    logger.info("Final new events: " + str(len(events)))

    return self.ProcessResults(processed=len(events), out_items=events)