def load_into_pandas(self, input_file, regen: bool = False):
    """Load csv mptcp data into a pandas DataFrame.

    :param input_file: filename (or fd) of a pcap or csv file to load
    :param regen: ignore the cache and regenerate any cached csv file
        from the input pcap
    :returns: a pandas.DataFrame
    """
    log.debug("Asked to load %s", input_file)

    # Normalize to an absolute, symlink-free path so the cache lookup
    # is stable regardless of how the caller spelled the path.
    filename = os.path.expanduser(input_file)
    filename = os.path.realpath(filename)

    # May trigger a (re)conversion of the pcap into a cached csv.
    csv_filename = self.get_matching_csv_filename(filename, regen)

    # Keep only the fields that declare an explicit dtype; the rest
    # are left for pandas to infer.
    temp = mp.get_fields("fullname", "type")
    dtypes = {field: dtype for field, dtype in temp.items() if dtype is not None}

    log.debug("Loading a csv file %s", csv_filename)
    data = pd.read_csv(
        csv_filename,
        sep=self.config["DEFAULT"]["delimiter"],
        dtype=dtypes,
        converters={
            # tshark exports tcp.flags as a hex string (e.g. "0x0012")
            "tcp.flags": lambda x: int(x, 16),
        },
    )
    # Rename columns from wireshark "fullname" to the project's short names.
    data.rename(inplace=True, columns=mp.get_fields("fullname", "name"))
    return data
def get_matching_csv_filename(self, filename, force_regen: bool):
    """Return the realpath of the csv matching *filename*, converting if needed.

    The name is a bit misleading: when the cache is missing, stale, or
    *force_regen* is set, this function also runs the tshark export to
    (re)generate the csv.

    Expects a realpath as *filename*; accepts either a .csv or a .pcap file.

    :param filename: realpath of the input .csv or .pcap file
    :param force_regen: ignore any cached csv and regenerate it
    :returns: realpath towards the resulting csv filename
    :raises Exception: with tshark's stderr when the export fails
    """
    realpath = filename
    basename, ext = os.path.splitext(realpath)

    if ext == ".csv":
        # Already a csv: nothing to convert, use it as-is.
        log.debug("Filename already has a .csv extension")
        csv_filename = realpath
    else:
        print("%s format is not supported as is. Needs to be converted first" % (filename))

        def matching_cache_filename(filename):
            """Map a realpath to its flat cache filename.

            Path separators are replaced with '%' so the whole source
            path becomes a single entry in the cache directory.
            """
            parts = os.path.realpath(filename).split(os.path.sep)
            res = os.path.join(self.config["DEFAULT"]["cache"], '%'.join(parts))
            _, ext = os.path.splitext(filename)
            if ext != ".csv":
                res += ".csv"
            return res

        csv_filename = matching_cache_filename(realpath)

        # Assume the cache is stale until proven fresher than the pcap.
        cache_is_invalid = True
        log.debug("Checking for %s", csv_filename)
        if os.path.isfile(csv_filename):
            log.info("A cache %s was found", csv_filename)
            ctime_cached = os.path.getctime(csv_filename)
            ctime_pcap = os.path.getctime(filename)
            if ctime_cached > ctime_pcap:
                log.debug("Cache seems valid")
                cache_is_invalid = False
            else:
                log.debug("Cache seems outdated")

        # if matching csv does not exist yet or if generation forced
        if force_regen or cache_is_invalid:
            # recursively create the cache directories
            log.debug("Creating cache directory [%s]", self.config["DEFAULT"]["cache"])
            os.makedirs(self.config["DEFAULT"]["cache"], exist_ok=True)

            log.info("Preparing to convert %s into %s", filename, csv_filename)
            exporter = TsharkExporter(
                self.config["DEFAULT"]["tshark_binary"],
                self.config["DEFAULT"]["delimiter"],
                self.config["DEFAULT"]["wireshark_profile"],
            )
            retcode, stderr = exporter.export_to_csv(
                filename,
                csv_filename,
                mp.get_fields("fullname", "name"),
                tshark_filter="mptcp and not icmp"
            )
            # BUG FIX: this used to be log.info("...code=", retcode) —
            # a print-style call. logging treats the extra argument as a
            # %-format arg for a message with no placeholder, so the
            # record failed to format and the return code was never shown.
            log.info("exporter exited with code=%s", retcode)
            if retcode:
                raise Exception(stderr)

    return csv_filename