Example #1
 def __init__(self, loader: TsharkConfig = None, **kwargs) -> None:
     super().__init__(df_name=kwargs.get("dest"), **kwargs)
     self.loader = loader or TsharkConfig()
     completer_method = functools.partial(cmd2.Cmd.path_complete, path_filter=lambda path: os.path.isfile(path))
     setattr(self, ATTR_CHOICES_CALLABLE,
             ChoicesCallable(is_method=True, is_completer=True, to_call=completer_method,))
Example #2
 def __init__(
     self,
     name: str,
     protocol: Protocol,
     loader=None,
     **kwargs
 ) -> None:
     """
     """
     self.loader = loader or TsharkConfig()
     self.protocol = protocol
     DataframeAction.__init__(self, df_name=name, **kwargs)
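# A self-contained sketch (hypothetical names, not the project's API) of the pattern the
# __init__ above follows: an argparse Action whose extra constructor parameters (name,
# protocol, loader) are bound with functools.partial before handing the action to
# add_argument(), so argparse only supplies the usual option_strings/dest/... kwargs.
import argparse
import functools


class LoadPcapAction(argparse.Action):
    def __init__(self, option_strings, dest, name=None, protocol=None, loader=None, **kwargs):
        super().__init__(option_strings, dest, **kwargs)
        self.df_name = name or dest
        self.protocol = protocol
        self.loader = loader  # e.g. a TsharkConfig in the real project

    def __call__(self, parser, namespace, values, option_string=None):
        # the real action would load the pcap into a dataframe here
        setattr(namespace, self.dest, values)


parser = argparse.ArgumentParser()
parser.add_argument("pcap", action=functools.partial(LoadPcapAction, name="pcap", protocol="mptcp"))
print(parser.parse_args(["capture.pcap"]))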
Example #3
def main():

    # https://docs.python.org/3/library/argparse.html#module-argparse
    # http://tricksntweaks.blogspot.be/2013/05/advance-argument-parsing-in-python.html
    parser = argparse.ArgumentParser(
        description='Generate MPTCP stats & plots',
        fromfile_prefix_chars='@',
    )
    parser.add_argument('--relative',
                        action="store_true",
                        help="set to export relative TCP seq number")
    parser.add_argument('--tshark',
                        dest="tshark_exe",
                        action="store",
                        default="tshark",
                        help="Path to shark binary")
    parser.add_argument(
        '--profile',
        action="store",
        default=None,
        help="Wireshark profile which contains many options to customize output"
    )

    # TODO tshark.py should rather accept streams
    # argparse.FileType('r')
    # parser.add_argument('xpconfig', default="tests.ini", action="store", type=str,  help="Config filename. Describe experiment settings")

    # parser.add_argument('inputPcap', action="store", help="src IP")

    pcap_parser = argparse.ArgumentParser(
        description='Expecting pcap file as input',
        add_help=False,
    )
    pcap_parser.add_argument('inputPcap', action="store", help="Input pcap")

    subparsers = parser.add_subparsers(dest="subparser_name",
                                       title="Subparsers",
                                       help='sub-command help')

    subparser_csv = subparsers.add_parser('pcap2csv',
                                          parents=[pcap_parser],
                                          help='Converts pcap to a csv file')
    # subparser_csv.add_argument('inputPcap', action="store", help="Input pcap")
    subparser_csv.add_argument('--output',
                               "-o",
                               action="store",
                               help="csv filename")
    subparser_csv.add_argument('--filter',
                               "-f",
                               action="store",
                               help="Filter",
                               default="")
    subparser_csv.add_argument(
        'fields_filename',
        type=argparse.FileType('r'),
        action="store",
        help="json file mapping name to their wireshark name")

    # List MPTCP connections and subflows
    sp_csv2sql = subparsers.add_parser(
        'csv2sql', help='Imports csv file to an sqlite database')
    sp_csv2sql.add_argument('inputCsv', action="store", help="Input Csv")
    sp_csv2sql.add_argument('output',
                            nargs="?",
                            action="store",
                            help="db filename")

    sp_pcap2sql = subparsers.add_parser(
        'pcap2sql', help='Converts pcap to an sqlite database')
    sp_pcap2sql.add_argument('inputPcap', action="store", help="Input pcap")
    sp_pcap2sql.add_argument('output',
                             nargs="?",
                             action="store",
                             help="db filename")

    args = parser.parse_args(sys.argv[1:])

    exporter = TsharkConfig(args.tshark_exe, profile=args.profile)
    # exporter.tcp_relative_seq = args.relative if args.relative else True
    exporter.tcp_relative_seq = args.relative
    # exporter.fields_to_export = fields_to_export

    log.debug("Relative #seq = %s" % exporter.tcp_relative_seq)
    if args.subparser_name == "pcap2csv":
        inputFilename = args.inputPcap
        outputFilename = args.output if args.output else get_basename(
            inputFilename, "csv")
        fields_to_export = load_fields_to_export_from_file(
            args.fields_filename)
        exporter.filter = args.filter
        print(fields_to_export)
        exporter.export_pcap_to_csv(inputFilename, outputFilename,
                                    fields_to_export)
    elif args.subparser_name == "csv2sql":
        inputFilename = args.inputCsv
        outputFilename = get_basename(inputFilename, "sqlite")
        convert_csv_to_sql(inputFilename, outputFilename, "connections")
    elif args.subparser_name == "pcap2sql":
        inputFilename = args.inputPcap
        outputFilename = get_basename(inputFilename, "sqlite")
        exporter.export_pcap_to_sql(inputFilename, outputFilename)
    else:
        parser.print_help()
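# Hypothetical invocation sketch (the file names are made up): the pcap2csv subcommand
# converts a capture into a csv, given a json file mapping field names to their
# wireshark names.
if __name__ == '__main__':
    sys.argv = ["exporter", "--tshark", "/usr/bin/tshark",
                "pcap2csv", "capture.pcap", "fields.json", "--output", "capture.csv"]
    main()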
Example #4
    def __init__(self, cfg: MpTcpAnalyzerConfig, stdin=sys.stdin, **kwargs) -> None:
        """
        Args:
            cfg (MpTcpAnalyzerConfig): A valid configuration

        Attributes:
            prompt (str): Prompt seen by the user, displays the currently loaded pcap
            config: configuration to get user parameters
            data:  dataframe currently in use
        """

        self.shortcuts.update({
            'lc': 'list_connections',
            'ls': 'list_subflows',
            'lr': 'list_reinjections'
        })
        super().__init__(completekey='tab', stdin=stdin)
        self.prompt = FG_COLORS['blue'] + "Ready>"  + color_off
        self.data = None  # type: pd.DataFrame
        self.config = cfg
        self.tshark_config = TsharkConfig(
            delimiter=cfg["mptcpanalyzer"]["delimiter"],
            profile=cfg["mptcpanalyzer"]["wireshark_profile"],
        )

        # cmd2 specific initialization
        self.abbrev = True  # when no ambiguities, run the command
        self.allow_cli_args = True  # disable autoload of transcripts
        self.allow_redirection = True  # allow pipes in commands
        self.default_to_shell = False
        self.debug = True  # for now
        self.set_posix_shlex = True  # need cmd2 >= 0.8

        #  Load Plots
        ######################
        # you can  list available plots under the namespace
        # https://pypi.python.org/pypi/entry_point_inspector
        # https://docs.openstack.org/stevedore/latest/reference/index.html#stevedore.extension.ExtensionManager
        # mgr = driver.DriverManager(
        self.plot_mgr = extension.ExtensionManager(
            namespace='mptcpanalyzer.plots',
            invoke_on_load=True,
            verify_requirements=True,
            invoke_args=(self.tshark_config,),
            # invoke_kwds
            propagate_map_exceptions=True,
            on_load_failure_callback=self.stevedore_error_handler
        )

        self.cmd_mgr = extension.ExtensionManager(
            namespace='mptcpanalyzer.cmds',
            invoke_on_load=True,
            verify_requirements=True,
            invoke_args=(),
            propagate_map_exceptions=False,
            on_load_failure_callback=self.stevedore_error_handler
        )

        #  do_plot parser
        ######################
        # not my first choice but to accommodate cmd2 constraints
        # see https://github.com/python-cmd2/cmd2/issues/498
        subparsers = MpTcpAnalyzerCmdApp.plot_parser.add_subparsers(dest="plot_type",
            title="Subparsers", help='sub-command help',)
        subparsers.required = True  # type: ignore

        def register_plots(ext, subparsers):
            """Adds a parser per plot"""
            # check if data is loaded
            parser = ext.obj.default_parser()
            assert parser, "Forgot to return parser"
            subparsers.add_parser(ext.name, parents=[parser], add_help=False)

        self.plot_mgr.map(register_plots, subparsers)
        # # will raise NoMatches when no plot available

        # if loading commands from a file, we disable the prompt so as not to pollute the output
        if stdin != sys.stdin:
            log.info("Disabling prompt because reading from stdin")
            self.use_rawinput = False
            self.prompt = ""
            self.intro = ""

        """
        The optional arguments stdin and stdout specify the input and
        output file objects that the Cmd instance or subclass instance will
        use for input and output. If not specified, they will default to
        sys.stdin and sys.stdout.
        """
        print("WARNING: mptcpanalyzer may require a custom wireshark. "
            "Check github for mptcp patches streaming.")
Example #5
class MpTcpAnalyzerCmdApp(cmd2.Cmd):
    """
    mptcpanalyzer can run in 3 modes:

    #. interactive mode (default): an interpreter with some basic completion will accept your commands.
       There is also some help embedded.
    #. if a filename is passed as argument, it will load commands from this file;
       otherwise, it will consider the unknown arguments as one command,
       the same that could be used interactively
    """

    intro = textwrap.dedent("""
        Press ? to list the available commands and `help <command>` or `<command> -h`
        for a detailed help of the command
        """.format(__version__))

    @staticmethod
    def stevedore_error_handler(manager, entrypoint, exception):
        print("Error while loading entrypoint [%s]" % entrypoint)

    def __init__(self, cfg: MpTcpAnalyzerConfig, stdin=sys.stdin, **kwargs) -> None:
        """
        Args:
            cfg (MpTcpAnalyzerConfig): A valid configuration

        Attributes:
            prompt (str): Prompt seen by the user, displays the currently loaded pcap
            config: configuration to get user parameters
            data:  dataframe currently in use
        """

        self.shortcuts.update({
            'lc': 'list_connections',
            'ls': 'list_subflows',
            'lr': 'list_reinjections'
        })
        super().__init__(completekey='tab', stdin=stdin)
        self.prompt = FG_COLORS['blue'] + "Ready>"  + color_off
        self.data = None  # type: pd.DataFrame
        self.config = cfg
        self.tshark_config = TsharkConfig(
            delimiter=cfg["mptcpanalyzer"]["delimiter"],
            profile=cfg["mptcpanalyzer"]["wireshark_profile"],
        )

        # cmd2 specific initialization
        self.abbrev = True  # when no ambiguities, run the command
        self.allow_cli_args = True  # disable autoload of transcripts
        self.allow_redirection = True  # allow pipes in commands
        self.default_to_shell = False
        self.debug = True  # for now
        self.set_posix_shlex = True  # need cmd2 >= 0.8

        #  Load Plots
        ######################
        # you can  list available plots under the namespace
        # https://pypi.python.org/pypi/entry_point_inspector
        # https://docs.openstack.org/stevedore/latest/reference/index.html#stevedore.extension.ExtensionManager
        # mgr = driver.DriverManager(
        self.plot_mgr = extension.ExtensionManager(
            namespace='mptcpanalyzer.plots',
            invoke_on_load=True,
            verify_requirements=True,
            invoke_args=(self.tshark_config,),
            # invoke_kwds
            propagate_map_exceptions=True,
            on_load_failure_callback=self.stevedore_error_handler
        )

        self.cmd_mgr = extension.ExtensionManager(
            namespace='mptcpanalyzer.cmds',
            invoke_on_load=True,
            verify_requirements=True,
            invoke_args=(),
            propagate_map_exceptions=False,
            on_load_failure_callback=self.stevedore_error_handler
        )

        #  do_plot parser
        ######################
        # not my first choice but to accommodate cmd2 constraints
        # see https://github.com/python-cmd2/cmd2/issues/498
        subparsers = MpTcpAnalyzerCmdApp.plot_parser.add_subparsers(dest="plot_type",
            title="Subparsers", help='sub-command help',)
        subparsers.required = True  # type: ignore

        def register_plots(ext, subparsers):
            """Adds a parser per plot"""
            # check if data is loaded
            parser = ext.obj.default_parser()
            assert parser, "Forgot to return parser"
            subparsers.add_parser(ext.name, parents=[parser], add_help=False)

        self.plot_mgr.map(register_plots, subparsers)
        # # will raise NoMatches when no plot available

        # if loading commands from a file, we disable the prompt so as not to pollute the output
        if stdin != sys.stdin:
            log.info("Disabling prompt because reading from stdin")
            self.use_rawinput = False
            self.prompt = ""
            self.intro = ""

        """
        The optional arguments stdin and stdout specify the input and
        output file objects that the Cmd instance or subclass instance will
        use for input and output. If not specified, they will default to
        sys.stdin and sys.stdout.
        """
        print("WARNING: mptcpanalyzer may require a custom wireshark. "
            "Check github for mptcp patches streaming.")

    @property
    def plot_manager(self):
        return self.plot_mgr

    @plot_manager.setter
    def plot_manager(self, mgr):
        """
        Override the default plot manager, only used for testing
        :param mgr: a stevedore plugin manager
        """
        self.plot_mgr = mgr

    def load_plugins(self, mgr=None):
        """
        This function monkey patches the class to inject Command plugins

        Attrs:
            mgr: override the default plugin manager when set.

        Useful to run tests
        """
        mgr = mgr if mgr is not None else self.cmd_mgr

        def _inject_cmd(ext, data):
            log.debug("Injecting plugin %s" % ext.name)
            for prefix in ["do", "help", "complete"]:
                method_name = prefix + "_" + ext.name
                try:
                    obj = getattr(ext.obj, prefix)
                    if obj:
                        setattr(MpTcpAnalyzerCmdApp, method_name, obj)
                except AttributeError:
                    log.debug("Plugin does not provide %s" % method_name)

        # there is also map_method available
        try:
            mgr.map(_inject_cmd, self)
        except stevedore.exception.NoMatches as e:
            log.error("stevedore: No matches (%s)" % e)

    def precmd(self, line):
        """
        Here we can preprocess line, with for instance shlex.split() ?
        Note:
            This is only called when using cmdloop, not with onecmd !
        """
        # default behavior
        print(">>> %s" % line)
        return line

    def cmdloop(self, intro=None):
        """
        overrides baseclass just to be able to catch exceptions
        """
        try:
            super().cmdloop()
        except KeyboardInterrupt as e:
            pass

        # Exception raised by sys.exit(), which is called by argparse
        # we don't want the program to finish just when there is an input error
        except SystemExit as e:
            self.cmdloop()
        except mp.MpTcpException as e:
            print(e)
            self.cmdloop()
        except Exception as e:
            log.critical("Unknown error, aborting...")
            log.critical("%s" % e)
            print("Displaying backtrace:\n")
            traceback.print_exc()

    def postcmd(self, stop, line):
        """
        Override baseclass
        returning true will stop the program
        """
        log.debug("postcmd result for line [%s] => %r", line, stop)

        return True if stop is True else False

    parser = MpTcpAnalyzerParser(description="List subflows of an MPTCP connection")
    filter_stream = parser.add_argument("mptcpstream", action="store", type=int,
        help="Equivalent to wireshark mptcp.stream id")
    # TODO for tests only, fix
    setattr(filter_stream, argparse_completer.ACTION_ARG_CHOICES, [0, 1, 2])

    @with_argparser(parser)
    @with_category(CAT_MPTCP)
    @is_loaded
    def do_list_subflows(self, args):
        """
        list mptcp subflows
                [mptcp.stream id]

        Example:
            ls 0
        """
        self.list_subflows(args.mptcpstream)

    @is_loaded
    def list_subflows(self, mptcpstreamid: int):

        try:
            con = MpTcpConnection.build_from_dataframe(self.data, mptcpstreamid)
            self.poutput("mptcp.stream %d has %d subflow(s) (client/server): " % (mptcpstreamid, len(con.subflows())))
            for sf in con.subflows():
                self.poutput("\t%s" % sf)
        except mp.MpTcpException as e:
            self.perror(e)

    # def help_list_subflows(self):
    #     print("Use parser -h")

    # def complete_list_subflows(self, text, line, begidx, endidx):
    #     """ help to complete the args """
    #     # conversion to set removes duplicate keys
    #     l = list(set(self.data["mptcpstream"]))
    #     # convert items to str else it won't be used for completion
    #     l = [str(x) for x in l]

    #     return l




    # parser = gen_pcap_parser({"pcap": PreprocessingActions.FilterStream | PreprocessingActions.Merge }, protocol="tcp")
    parser = argparse_completer.ACArgumentParser(
        description='''
        This function tries to map a tcp.stream id from one pcap
        to one in another pcap in another dataframe.
        '''
    )

    # TODO could use LoadSinglePcap
    load_pcap1 = parser.add_argument("pcap1", action="store", help="first to load")
    load_pcap2 = parser.add_argument("pcap2", action="store", help="second pcap")

    # cmd2.Cmd.path_complete ?
    # setattr(action_stream, argparse_completer.ACTION_ARG_CHOICES, range(0, 10))
    # use path_filter
    setattr(load_pcap1, argparse_completer.ACTION_ARG_CHOICES, ('path_complete', ))
    setattr(load_pcap2, argparse_completer.ACTION_ARG_CHOICES, ('path_complete', ))

    parser.add_argument("tcpstreamid", action="store", type=int,
        help="tcp.stream id visible in wireshark for pcap1")
    parser.add_argument("--json", action="store_true", default=False,
        help="Machine readable summary.")
    parser.add_argument( '-v', '--verbose', dest="verbose", default=False, action="store_true",
        help="how to display each connection")

    parser.epilog = '''
    Examples:
        map_tcp_connection examples/client_1_tcp_only.pcap examples/server_1_tcp_only.pcap  0
    '''

    @with_argparser(parser)
    @with_category(CAT_TCP)
    def do_map_tcp_connection(self, args):

        df1 = load_into_pandas(args.pcap1, self.tshark_config)
        df2 = load_into_pandas(args.pcap2, self.tshark_config)

        main_connection = TcpConnection.build_from_dataframe(df1, args.tcpstreamid)

        mappings = map_tcp_stream(df2, main_connection)

        self.poutput("Trying to map %s" % (main_connection,))
        self.poutput("%d mapping(s) found" % len(mappings))

        for match in mappings:

            # formatted_output = main.format_mapping(match)
            # output = "{c1.tcpstreamid} <-> {c2.tcpstreamid} with score={score}"
            # formatted_output = output.format(
            #     c1=main_connection,
            #     c2=match,
            #     score=score
            # )
            # print(formatted_output)
            self.poutput("%s" % str(match))



    parser = MpTcpAnalyzerParser(
        description="This function tries to map an mptcp.stream from a dataframe "
                    "(aka pcap) to an mptcp.stream "
                    "in another dataframe."
    )

    load_pcap1 = parser.add_argument("pcap1", action="store", type=str, help="first to load")
    load_pcap2 = parser.add_argument("pcap2", action="store", type=str, help="second pcap")

    setattr(load_pcap1, argparse_completer.ACTION_ARG_CHOICES, ('path_complete', ))
    setattr(load_pcap2, argparse_completer.ACTION_ARG_CHOICES, ('path_complete', ))
    parser.add_argument("mptcpstreamid", action="store", type=int, help="to filter")
    parser.add_argument("--trim", action="store", type=float, default=0, 
            help="Remove mappings with a score below this threshold")
    parser.add_argument("--limit", action="store", type=int, default=2,
            help="Limit display to the --limit best mappings")
    parser.add_argument( '-v', '--verbose', dest="verbose", default=False, action="store_true",
        help="display all candidates")

    @with_argparser(parser)
    @with_category(CAT_MPTCP)
    @experimental
    def do_map_mptcp_connection(self, args):
        """
        Tries to map mptcp.streams from different pcaps.
        Score based mechanism

        Todo:
            - Limit number of displayed matches
        """

        df1 = load_into_pandas(args.pcap1, self.tshark_config)
        df2 = load_into_pandas(args.pcap2, self.tshark_config)


        main_connection = MpTcpConnection.build_from_dataframe(df1, args.mptcpstreamid)
        mappings = map_mptcp_connection(df2, main_connection)


        self.poutput("%d mapping(s) found" % len(mappings))
        mappings.sort(key=lambda x: x.score, reverse=True)

        for rank, match in enumerate(mappings):

            if rank >= args.limit:
                self.pfeedback("ignoring mappings left")
                break

            winner_like = match.score == float('inf')

            output = "{c1.mptcpstreamid} <-> {c2.mptcpstreamid} with score={score} {extra}"
            formatted_output = output.format(
                c1=main_connection,
                c2=match.mapped,
                score=FG_COLORS['red'] + str(match.score) + color_off,
                extra= " <-- should be a correct match" if winner_like else ""
            )

            if match.score < args.trim:
                continue

            # match = MpTcpMapping(match.mapped, match.score, mapped_subflows)
            def _print_subflow(x):
                return "\n-" + x[0].format_mapping(x[1])

            formatted_output += ''.join(_print_subflow(x) for x in match.subflow_mappings)

            self.poutput(formatted_output)


    # def parser_summary():
    #     """ """
    #     pass

    summary_parser = MpTcpAnalyzerParser(description="Prints a summary of the mptcp connection")
    action_stream = summary_parser.add_argument(
        "mptcpstream", type=MpTcpStreamId, action=mp.parser.retain_stream("pcap"),
        help="mptcp.stream id")
    # TODO update the stream id autocompletion dynamically?
    # setattr(action_stream, argparse_completer.ACTION_ARG_CHOICES, range(0, 10))

    summary_parser.add_argument(
        'destination',
        # mp.DestinationChoice,
        action="store", choices=mp.DestinationChoice, type=lambda x: mp.ConnectionRoles[x],
        help='Filter flows according to their direction'
        '(towards the client or the server)'
        'Depends on mptcpstream'
    )
    summary_parser.add_argument("--json", action="store_true", default=False,
        help="Machine readable summary.")
    @with_argparser_test(summary_parser, preload_pcap=True)
    @is_loaded
    def do_summary(self, args, unknown):
        """
        Naive summary contributions of the mptcp connection
        See summary_extended for more details
        """

        df = self.data

        # myNs = Namespace()
        # myNs._dataframes = { "pcap": self.data }
        # args = parser.parse_args(args, myNs)
        mptcpstream = args.mptcpstream

        success, ret = stats.mptcp_compute_throughput(
            self.data, args.mptcpstream, args.destination
        )
        if success is not True:
            self.perror("Throughput computation failed:")
            self.perror(ret)
            return

        if args.json:
            import json
            # TODO use self.poutput
            # or use a stream, it must just be testable
            val = json.dumps(ret, ensure_ascii=False)
            self.poutput(val)
            return

        mptcp_transferred = ret["mptcp_throughput_bytes"]
        self.poutput("mptcpstream %d transferred %d bytes." % (ret["mptcpstreamid"], mptcp_transferred))
        for tcpstream, sf_bytes in map(lambda sf: (sf["tcpstreamid"], sf["throughput_bytes"]), ret["subflow_stats"]):
            subflow_load = sf_bytes/mptcp_transferred
            self.poutput("tcpstream {} transferred {sf_tput} bytes out of {mptcp_tput}, "
                    "accounting for {tput_ratio:.2f}%".format(
                tcpstream, sf_tput=sf_bytes, mptcp_tput=mptcp_transferred, 
                tput_ratio=subflow_load*100
            ))


    parser = gen_pcap_parser({"pcap": PreprocessingActions.Preload})
    parser.description = "Export connection(s) to CSV"
    parser.epilog = '''

    '''
    # it has to take the pcap here, otherwise I cannot autofilter:
    parser.add_argument("output", action="store", help="Output filename")

    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--tcpstream', action=functools.partial(FilterStream, "pcap", False), type=TcpStreamId)
    group.add_argument('--mptcpstream', action=functools.partial(FilterStream, "pcap", True), type=MpTcpStreamId)
    # parser.add_argument("protocol", action="store", choices=["mptcp", "tcp"], help="tcp.stream id visible in wireshark")
    # TODO check ?
    parser.add_argument("--destination", action="store", 
        choices=mp.DestinationChoice,
        help="tcp.stream id visible in wireshark")
    parser.add_argument("--drop-syn", action="store_true", default=False,
        help="Helper just for my very own specific usecase")
    @is_loaded
    @with_argparser(parser)
    def do_tocsv(self, args):
        """
        Selects tcp/mptcp/udp connection and exports it to csv
        """

        df = self.data
        # TODO let the parser do it
        # if args.tcpstream:
        #     # df = df[ df.tcpstream == args.tcpstream]

        #     self.poutput("Filtering tcpstream")
        #     con = TcpConnection.build_from_dataframe(df, args.tcpstream)
        #     if args.destination:
        #         self.poutput("Filtering destination")
        #         q = con.generate_direction_query(args.destination)
        #         df = df.query(q)

        # elif args.mptcpstream:
        #     self.poutput("Unsupported yet")
            # df = df[ df.mptcpstream == args.mptcpstream]

        # need to compute the destinations before dropping syn from the dataframe
        # df['tcpdest'] = np.nan;
        for streamid, subdf in df.groupby("tcpstream"):
            con = TcpConnection.build_from_dataframe(df, streamid)
            df = mpdata.tcpdest_from_connections(df, con)

            if args.drop_syn:
                # use subdf ?
                self.poutput("drop-syn Unsupported yet")
                df.drop(subdf.head(3).index, inplace=True)
                # drop 3 first packets of each connection ?
                # this should be a filter
                syns = df[df.tcpflags == mp.TcpFlags.SYN]
        #     df = df[ df.flags ]
        # if args.destination:
        #     if args.tcpstream:
                # TODO we should filter destination
        self.poutput("Writing to %s" % args.output)
        pandas_to_csv(df, args.output)


    parser = gen_bicap_parser("mptcp", True)
    parser.add_argument("--json", action="store_true", default=False,
        help="Machine readable summary.")
    parser.description = """
        Look into more details of an mptcp connection
        """
    parser.epilog = """
        summary_extended examples/client_2_redundant.pcapng 0 examples/server_2_redundant.pcapng 0
        """
    @with_argparser(parser)
    def do_summary_extended(self, args):
        """
        Summarize contributions of each subflow
        For now it is naive, does not look at retransmissions ?
        """

        print("%r" % args)
        df_pcap1 = load_into_pandas(args.pcap1, self.tshark_config)

        destinations = args.destinations
        # or list(mp.ConnectionRoles)

        for destination in destinations:
            success, basic_stats = stats.mptcp_compute_throughput(
                # TODO here we should load the pcap before hand !
                df_pcap1,
                args.pcap1stream,
                args.destinations
            )
            if success is not True:
                self.perror("Error %s" % basic_stats)

            # TODO already be done
            # TODO we should have the parser do it
            df = load_merged_streams_into_pandas(
                args.pcap1,
                args.pcap2,
                args.pcap1stream,
                args.pcap2stream,
                True,
                self.tshark_config
            )

            success, ret = stats.mptcp_compute_throughput_extended(
                df,
                stats=basic_stats,
                destination=destination
            )

            if success is not True:
                self.perror("Throughput computation failed:")
                self.perror(ret)
                return

            if args.json:
                import json
                # TODO use self.poutput
                # or use a stream, it must just be testable
                val = json.dumps(ret, ensure_ascii=False)
                self.poutput(val)
                return


            # TODO display goodput/ratio
            total_transferred = ret["mptcp_throughput_bytes"]
            #  (ret["mptcpstreamid"], ret["mptcp_bytes"]))
            msg = "mptcpstream {mptcpstreamid} throughput/goodput {mptcp_throughput_bytes}/{mptcp_goodput_bytes}"
            self.poutput(msg.format(**ret))
            for sf in ret["subflow_stats"]:

                subflow_load = sf_bytes/ret["mptcp_bytes"]
                msg = """
                tcpstream {tcpstreamid} analysis:
                - throughput: transferred {} out of {mptcp_throughput_bytes}, accounting for {.2f:throughput_contribution}%
                - goodput: transferred {mptcp_goodput} out of {mptcp_goodput_bytes}, accounting for {.2f:goodput_contribution}%
                """
                
                self.poutput(
                    msg.format(
                    mptcp_tput=ret["mptcp_throughput_bytes"],
                    **ret,
                    **sf
                ))

    # 
    @is_loaded
    @with_category(CAT_TCP)
    def do_list_tcp_connections(self, *args):
        """
        List tcp connections via their ids (tcp.stream)
        """
        streams = self.data.groupby("tcpstream")
        self.poutput('%d tcp connection(s)' % len(streams))
        for tcpstream, group in streams:
            # self.list_subflows(mptcpstream)
            self.data.tcp.connection(tcpstream)
            con = TcpConnection.build_from_dataframe(self.data, tcpstream)
            self.poutput(con)
            self.poutput("\n")


    @is_loaded
    @with_category(CAT_MPTCP)
    def do_list_mptcp_connections(self, *args):
        """
        List mptcp connections via their ids (mptcp.stream)
        """
        streams = self.data.groupby("mptcpstream")
        self.poutput('%d mptcp connection(s)' % len(streams))
        for mptcpstream, group in streams:
            self.list_subflows(mptcpstream)
            self.poutput("\n")

    # def generate_namespace(self) -> argparse.Namespace:
    #     myNamespace = Namespace()
    #     myNamespace.toto = self.data
    #     parser = argparse_completer.ACArgumentParser(
    #         description="""
    #         Mptcpanalyzer filters pcaps to keep only tcp packets.
    #         This may explain why printed packet ids dont map
    #         """
    #     )

    load_pcap1 = parser.add_argument("imported_pcap", type=str, help="Capture file to cleanup.")
    setattr(load_pcap1, argparse_completer.ACTION_ARG_CHOICES, ('path_complete', ))
    parser.add_argument("exported_pcap", type=str, help="Cleaned up file")

    @with_argparser(parser)
    def do_clean_pcap(self, args):
        """
        Export a cleaned up copy of the pcap (filtered by tshark).
        """
        self.poutput("Exporting a clean version of {} in {}".format(
            args.imported_pcap, args.exported_pcap))

        self.tshark_config.filter_pcap(args.imported_pcap, args.exported_pcap)


    # TODO it should be able to print for both 
    parser = gen_bicap_parser("tcp", True)
    parser.description = """This function tries merges a tcp stream from 2 pcaps
                        in an attempt to print owds. See map_tcp_connection first maybe."""

    # TODO add a limit of packets or use ppaged()
    # parser.add_argument("protocol", action="store", choices=["mptcp", "tcp"],
    #     help="tcp.stream id visible in wireshark")
    # give a choice "hash" / "stochastic"
    parser.add_argument(
        '-v', '--verbose', dest="verbose", default=False,
        action="store_true",
        help="how to display each connection"
    )
    parser.add_argument("--csv", action="store", default=None,
        help="Machine readable summary.")
    parser.epilog = '''
    You can run for example:
        map_tcp_connection examples/client_1_tcp_only.pcap examples/server_1_tcp_only.pcap  0
    '''
    @with_argparser(parser)
    @experimental
    def do_print_owds(self, args):
        """
        TODO options to diagnose errors:
        - print unmapped packets
        - print abnormal OWDs (negative etc)
        """

        self.poutput("Loading merged streams")
        df = args._dataframes["pcap"]
        result = df
        print(result.head(10))
        # print("%r" % result)
        # print(result[mpdata.TCP_DEBUG_FIELDS].head(20))
        # for key, subdf in df.groupby(_sender("tcpdest"))

        # todo sort by chronological order ?
        # for row in df.itertuples();
            # self.ppaged()

        if args.csv:
            self.pfeedback("Exporting to csv")
            with open(args.csv, "w") as fd:
                df.to_csv(
                    fd,
                    sep="|",
                    index=False,
                    header=True,
                )

        # print unmapped packets
        print("print_owds finished")
        # print("TODO display before doing plots")
        # TODO display errors
        print(result[["owd"]].head(20))
        # print(result.columns)
        mpdata.print_weird_owds(result)
        # print(result[["owd"]].head(20))

    def do_check_tshark(self, line):
        """
        Check your tshark/wireshark version
        """
        self.poutput("TODO implement automated check")
        self.poutput("you need a wireshark > 19 June 2018 with commit dac91db65e756a3198616da8cca11d66a5db6db7...")


    parser = gen_bicap_parser("mptcp", dest=True)
    parser.description = """
        Qualify reinjections of the connection.
        You might want to run map_mptcp_connection first to find out
        what maps to what
        """
    parser.add_argument("--failed", action="store_true", default=False,
        help="List failed reinjections too.")
    parser.add_argument("--csv", action="store_true", default=False,
        help="Machine readable summary.")
    parser.add_argument("--debug", action="store_true", default=False,
        help="Explain decision for every reinjection.")

    @with_argparser_and_unknown_args(parser)
    @with_category(CAT_MPTCP)
    @experimental
    def do_qualify_reinjections(self, args, unknown):
        """
        test with:
            mp qualify_reinjections 0

        TODO move the code into a proper function
        """
        # TODO this should be done automatically right ?
        df_all = load_merged_streams_into_pandas(
            args.pcap1,
            args.pcap2,
            args.pcap1stream,
            args.pcap2stream,
            mptcp=True,
            tshark_config=self.tshark_config
        )

        # adds a redundant column
        df = classify_reinjections(df_all)

        # print(df_all[ pd.notnull(df_all[_sender("reinjection_of")])] [
        #     _sender(["reinjection_of", "reinjected_in", "packetid", "reltime"]) +
        #     _receiver(["packetid", "reltime"])
        # ])

        # to help debug
        # df.to_excel("temp.xls")

        def _print_reinjection_comparison(original_packet, reinj, ):
            """
            Expects tuples of original and reinjection packets
            """
            # original_packet  = sender_df.loc[ sender_df.packetid == initial_packetid, ].iloc[0]
            row = reinj

            reinjection_packetid = getattr(row, _sender("packetid"))
            reinjection_start    = getattr(row, _sender("abstime"))
            reinjection_arrival  = getattr(row, _receiver("abstime"))
            original_start       = original_packet[_sender("abstime")]
            original_arrival     = original_packet[_receiver("abstime")]

            if reinj.redundant == False:
                # print(original_packet["packetid"])
                msg = ("packet {pktid} is a successful reinjection of {initial_packetid}."
                        " It arrived at {reinjection_arrival} to compare with {original_arrival}"
                        " while being transmitted at {reinjection_start} to compare with "
                        "{original_start}, i.e., {reinj_delta} before")
                # TODO use assert instead
                if getattr(row, _receiver("abstime")) > original_packet[ _receiver("abstime") ]:
                    print("BUG: this is not a valid reinjection after all ?")

            elif args.failed:
                # only de
                msg = "packet {pktid} is a failed reinjection of {initial_packetid}."
            else:
                return

            msg = msg.format(
                pktid               = reinjection_packetid,
                initial_packetid    = initial_packetid,

                reinjection_start   = reinjection_start,
                reinjection_arrival = reinjection_arrival,
                original_start      = original_start,
                original_arrival    = original_arrival,
                reinj_delta         = reinj.reinj_delta,
            )
            self.poutput(msg)


        # with pd.option_context('display.max_rows', None, 'display.max_columns', 300):
        #     print(reinjected_packets[["packetid", "packetid_receiver", *_receiver(["reinjected_in", "reinjection_of"])]].head())
        # TODO filter depending on --failed and --destinations

        if args.csv:
            self.pfeedback("Exporting to csv")
            # keep redundant
            # only export a subset ?
            # for 
            # df1 = df[['a','d']]
            # smalldf = df.drop()
            columns = _sender(["abstime", "reinjection_of", "reinjected_in", "packetid", "tcpstream", "mptcpstream", "tcpdest", "mptcpdest"])
            columns += _receiver(["abstime", "packetid"])
            columns += ["redundant", "owd", "reinj_delta"]

            df[columns].to_csv(
                self.stdout,
                sep="|",
                index=False,
                header=True,
            )
            return

        for destination in ConnectionRoles:

            if args.destinations and destination not in args.destinations:
                log.debug("ignoring destination %s " % destination)
                continue

            self.poutput("looking for reinjections towards mptcp %s" % destination)
            sender_df = df[df.mptcpdest == destination]
            log.debug("%d reinjections in that direction" % (len(sender_df), ))

            # TODO we now need to display successful reinjections
            reinjections = sender_df[pd.notnull(sender_df[_sender("reinjection_of")])]

            successful_reinjections = reinjections[reinjections.redundant == False]

            self.poutput("%d successful reinjections" % len(successful_reinjections))
            # print(successful_reinjections[ _sender(["packetid", "reinjection_of"]) + _receiver(["packetid"]) ])

            for row in reinjections.itertuples(index=False):

                # loc ? this is an array, sort it and take the first one ?
                initial_packetid = row.reinjection_of[0]
                # print("initial_packetid = %r %s" % (initial_packetid, type(initial_packetid)))

                original_packet  = df_all.loc[df_all.packetid == initial_packetid].iloc[0]
                # print("original packet = %r %s" % (original_packet, type(original_packet)))

                # if row.redundant == True and args.failed:
                    # _print_failed_reinjection(original_packet, row, debug=args.debug)

                _print_reinjection_comparison(original_packet, row, )

                
    parser = MpTcpAnalyzerParser(
        description="Listing reinjections of the connection"
    )
    parser.add_argument("mptcpstream", type=MpTcpStreamId, help="mptcp.stream id")
    parser.add_argument("--summary", action="store_true", default=False,
            help="Just count reinjections")

    @is_loaded
    @with_category(CAT_MPTCP)
    @with_argparser_test(parser)
    def do_list_reinjections(self, args):
        """
        List reinjections
        We want to be able to distinguish between good and bad reinjections
        (like good and bad RTOs).
        A good reinjection is a reinjection for which either:
        - the segment arrives first at the receiver
        - the cumulative DACK arrives at the sender sooner thanks to that reinjection

        To do that, we need to take into account latencies

        """

        df = self.data
        df = self.data[df.mptcpstream == args.mptcpstream]
        if df.empty:
            self.poutput("No packet with mptcp.stream == %d" % args.mptcpstream)
            return

        # known : Set[int] = set()
        # print(df.columns)

        # TODO move to outer function ?
        # TODO use ppaged
        reinjections = df.dropna(axis=0, subset=["reinjection_of"] )
        total_nb_reinjections = 0
        output = ""
        for row in reinjections.itertuples():
            # if row.packetid not in known:
            # ','.join(map(str,row.reinjection_of)
            output += ("packetid=%d (tcp.stream %d) is a reinjection of %d packet(s): " %
                (row.packetid, row.tcpstream, len(row.reinjection_of)))

            # print("reinjOf=", row.reinjection_of)
            # assuming packetid is the index
            for pktId in row.reinjection_of:
                # print("packetId %d" % pktId)
                # entry = self.data.iloc[ pktId - 1]
                entry = self.data.loc[ pktId ]
                # entry = df.loc[ df.packetid == pktId]
                # print("packetId %r" % entry)
                output += ("- packet %d (tcp.stream %d)" % (entry.packetid, entry.tcpstream))
            # known.update([row.packetid] + row.reinjection)

        self.ppaged(output)
        # reinjections = df["reinjection_of"].dropna(axis=0, )
        # print("number of reinjections of ")


    parser = MpTcpAnalyzerParser(
        description="Loads a pcap to analyze"
    )
    parser.add_argument("input_file", action=LoadSinglePcap, 
        help="Either a pcap or a csv file."
        "When a pcap is passed, mptcpanalyzer looks for a cached csv"
        "else it generates a "
        "csv from the pcap with the external tshark program.")
    @with_argparser(parser)
    def do_load_pcap(self, args):
        """
        Load the file as the current one
        """
        print(args)
        # args = shlex.split(args)
        # print(args)
        # parser = self.do_load_pcap.argparser
        # print(parser)
        # args = parser.parse_args(args)
        
        self.poutput("Loading %s" % args.input_file)
        self.data = args._dataframes["input_file"]
        self.prompt = "%s> " % os.path.basename(args.input_file)

    def do_list_available_plots(self, args):
        """
        Print available plots. Mostly for debug, you should use 'plot'.
        """
        plot_names = self.list_available_plots()
        print(plot_names)

    def list_available_plots(self):
        return self.plot_mgr.names()

    def pcap_loaded(self):
        return isinstance(self.data, pd.DataFrame)

    plot_parser = MpTcpAnalyzerParser(prog='plot', description='Generate plots')
    # TODO complete the help
    # plot throughput tcp examples/client_2_redundant.pcapng 0 examples/server_2_redundant.pcapng 0 3" "quit"
    plot_parser.epilog = '''
        You can run for example:
            plot owd tcp examples/client_2_filtered.pcapng 0 examples/server_2_filtered.pcapng 0 --display
    '''

    @with_argparser_and_unknown_args(plot_parser)
    def do_plot(self, args, unknown):
        """
        global member used by the other do_plot* members
        Loads required dataframes when necessary
        """

        # Allocate plot object
        plotter = self.plot_mgr[args.plot_type].obj

        # TODO reparse with the definitive parser ?

        # 'converts' the namespace into a dict so it can be unpacked with the ** syntax
        dargs = vars(args)

        print("%s" % dargs)
        dataframes = dargs.pop("_dataframes")
        # workaround argparse limitations to set as default both directions
        # TODO replace that with an action ?
        # destinations=dargs.get("destinations", list(mp.ConnectionRoles))
        # dargs.update(destinations=destinations)
        # log.debug("Selecting destinations %s" % (destinations,))
        # dataframes = plotter.preprocess(**dargs)
        print("%s" % args)
        # dataframes = args._dataframes.values()
        assert dataframes is not None, "Preprocess must return a list"
        # pass unknown_args too ?
        result = plotter.run(**dataframes, **dargs)

        # to save to file for instance
        plotter.postprocess(result, **dargs)
 
    @with_category(CAT_GENERAL)
    def do_clean_cache(self, line):
        """
        mptcpanalyzer saves pcap-to-csv converted files in a cache folder (most likely
        $XDG_CACHE_HOME/mptcpanalyzer). This command clears the cache.
        """
        cache = mp.get_cache()
        self.poutput("Cleaning cache [%s]" % cache.folder)
        cache.clean()

    def do_dump(self, args):
        """
        Dumps content of the csv file, with columns selected by the user.
        Mostly used for debug
        """
        parser = argparse.ArgumentParser(description="dumps csv content")
        parser.add_argument('columns', default=[
                            "ipsrc", "ipdst"], choices=self.data.columns, nargs="*")

        parser.add_argument('-n', default=10, action="store",
                help="Number of results to display")
        args = parser.parse_args(shlex.split(args))
        print(self.data[args.columns])

    def complete_dump(self, text, line, begidx, endidx):
        """
        Should return a list of possibilities
        """
        l = [x for x in self.data.columns if x.startswith(text)]
        return l

    # not needed in cmd2
    def do_quit(self, *args):
        """
        Quit/exit program
        """
        print("Thanks for flying with mptcpanalyzer.")
        return True

    def do_EOF(self, line):
        """
        Keep it to be able to exit with CTRL+D
        """
        return True

    def preloop(self):
        """
        Executed once when cmdloop is called
        """
        histfile = self.config["mptcpanalyzer"]['history']
        if readline and os.path.exists(histfile):
            log.debug("Loading history from %s" % histfile)
            readline.read_history_file(histfile)

    def postloop(self):
        histfile = self.config["mptcpanalyzer"]['history']
        if readline:
            log.debug("Saving history to %s" % histfile)
            readline.set_history_length(1000)  # histfile_size was undefined; assume a sane default
            readline.write_history_file(histfile)
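# A minimal sketch (hypothetical plugin, not shipped with the project) of an object that
# load_plugins() above can inject: stevedore instantiates whatever is registered under the
# 'mptcpanalyzer.cmds' entry point, then _inject_cmd copies its optional 'do', 'help' and
# 'complete' attributes onto MpTcpAnalyzerCmdApp as do_<name>, help_<name>, complete_<name>.
class HelloCommand:
    """Would be registered e.g. as 'hello = mypackage.plugins:HelloCommand'."""

    def do(self, line):
        # becomes MpTcpAnalyzerCmdApp.do_hello
        print("hello plugin called with: %r" % line)

    def help(self):
        # becomes MpTcpAnalyzerCmdApp.help_hello
        print("hello [args]: echo the arguments back")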
Example #6
def load_into_pandas(
        input_file: str,
        config: TsharkConfig,
        # clock_offset: int = 0,
        **extra) -> pd.DataFrame:
    """
    load mptcp  data into pandas

    Args:
        input_file: pcap filename
        config: Hard, keep changing
        load_cb: callback to use if cache not available
        extra: extra arguments to forward to load_cb
    """
    log.debug("Asked to load simple pcap %s" % input_file)

    filename = getrealpath(input_file)
    cache = mp.get_cache()

    tshark_dtypes = {
        fullname: field.type
        for fullname, field in config.fields.items()
    }

    artifical_dtypes = {
        name: field.type
        for name, field in per_pcap_artificial_fields.items()
    }
    dtypes = dict(tshark_dtypes, **artifical_dtypes)

    # TODO add per_pcap_artificial_fields hash
    pseudohash = hash(config) + hash(frozenset(dtypes.items()))
    uid = cache.cacheuid(
        '',  # prefix (might want to shorten it a bit)
        [filename],  # dependencies
        str(pseudohash) + '.csv')

    is_cache_valid, csv_filename = cache.get(uid)

    log.debug("cache validity=%d cachename: %s" %
              (is_cache_valid, csv_filename))
    if not is_cache_valid:
        log.info("Cache invalid .. Converting %s " % (filename, ))

        with tempfile.NamedTemporaryFile(mode='w+',
                                         prefix="mptcpanalyzer-",
                                         delete=False) as out:
            tshark_fields = [
                field.fullname for _, field in config.fields.items()
            ]
            retcode, stderr = config.export_to_csv(filename, out,
                                                   tshark_fields)
            log.info("exporter exited with code=%d", retcode)
            if retcode == 0:
                out.close()
                cache.put(uid, out.name)
            else:
                raise Exception(stderr)

    log.debug("Loading a csv file %s" % csv_filename)

    try:
        with open(csv_filename) as fd:

            converters = {
                f.fullname: f.converter
                for _, f in config.fields.items() if f.converter
            }
            converters.update({
                name: f.converter
                for name, f in per_pcap_artificial_fields.items()
                if f.converter
            })
            # print("converters\n", converters)

            dtypes = {
                field.fullname: field.type
                for _, field in config.fields.items()
            }
            log.debug("Dtypes before load: %s" % dtypes)
            data = pd.read_csv(
                fd,
                comment='#',
                sep=config.delimiter,
                dtype=dtypes,
                # seems like for now we can't change the default representation apart from converting the column to
                # a string !!!
                # https://stackoverflow.com/questions/46930201/pandas-to-datetime-is-not-formatting-the-datetime-value-in-the-desired-format
                # date_parser=_convert_timestamp,
                # parse_dates=["frame.time_epoch"],
                converters=converters,
                # float_precision="high",  # might be necessary
                # nrows=10, # useful for debugging purpose
            )
            # 1 to 1 -> can't add new columns
            data.rename(inplace=True,
                        columns={
                            f.fullname: name
                            for name, f in config.fields.items()
                        })

            # add new columns
            data = data.assign(
                **{name: np.nan
                   for name in per_pcap_artificial_fields.keys()})
            column_names = set(data.columns)
            # print("column_names", column_names)
            data = data.astype(dtype=artifical_dtypes, copy=False)

            # we want packetid column to survive merges/dataframe transformations so keep it as a column
            # TODO remove ? let other functions do it ?
            data.set_index("packetid", drop=False, inplace=True)
            log.debug("Column names: %s" % data.columns)

            hashing_fields = [
                name for name, field in config.fields.items() if field.hash
            ]
            log.debug("Hashing over fields %s" % hashing_fields)

            # won't work because it passes a Series (mutable)
            # TODO generate hashing fields from Fields
            temp = pd.DataFrame(data, columns=hashing_fields)
            data["hash"] = temp.apply(lambda x: hash(tuple(x)), axis=1)

    except Exception as e:
        log.error(
            "You may need to filter your pcap more to keep only mptcp packets")
        raise e

    log.info("Finished loading dataframe for %s. Size=%d" %
             (input_file, len(data)))

    # print("FINAL_DTYPES")
    log.debug(data.dtypes)
    # print(data.head(5))
    return data
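# A rough sketch (hypothetical helper, not part of the library) of the cache-key idea used
# above: the cached csv is keyed on the pcap path plus a hash of the tshark configuration
# and of the expected dtypes, so changing either invalidates the cache entry.
def sketch_cache_name(pcap_path: str, config, dtypes: dict) -> str:
    pseudohash = hash(config) + hash(frozenset(dtypes.items()))
    return "%s-%s.csv" % (pcap_path, pseudohash)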
Example #7
def load_into_pandas(input_file: str, config: TsharkConfig,
                     **extra) -> pd.DataFrame:
    """
    load mptcp data into pandas

    Args:
        input_file: pcap filename
        config: Hard, keep changing
        load_cb: callback to use if cache not available
        extra: extra arguments to forward to load_cb
    """
    log.debug("Asked to load simple pcap %s", input_file)

    filename = getrealpath(input_file)
    cache = mp.get_cache()

    # {fullname: field.type for fullname, field in config.fields.items()}
    tshark_dtypes = get_dtypes(config.fields)

    artifical_dtypes = get_dtypes(per_pcap_artificial_fields)
    dtypes = dict(tshark_dtypes, **artifical_dtypes)

    # TODO add per_pcap_artificial_fields hash
    pseudohash = hash(config) + hash(frozenset(dtypes.items()))
    uid = cache.cacheuid(
        '',  # prefix (might want to shorten it a bit)
        [filename],  # dependencies
        str(pseudohash) + '.csv')
    # print(config.fields)

    is_cache_valid, csv_filename = cache.get(uid)

    log.debug("cache validity=%d cachename: %s", is_cache_valid, csv_filename)
    if not is_cache_valid:
        log.info(
            "Cache invalid .. Converting %s",
            filename,
        )

        with tempfile.NamedTemporaryFile(mode='w+',
                                         prefix="mptcpanalyzer-",
                                         delete=False) as out:
            # tshark_fields = [field.fullname for _, field in config.fields.items()]
            tshark_fields = {
                field.fullname: name
                for name, field in config.fields.items()
            }
            retcode, _, stderr = config.export_to_csv(filename, out,
                                                      tshark_fields)
            log.info("exporter exited with code=%d", retcode)
            if retcode == 0:
                out.close()
                cache.put(uid, out.name)
            else:
                raise Exception(stderr)

    log.debug("Loading a csv file %s", csv_filename)

    try:
        with open(csv_filename) as fd:

            # gets the list of fields to convert;
            # we don't want to modify the passed parameter
            fields = config.fields.copy()
            fields.update(per_pcap_artificial_fields)
            # use the merged dict so artificial fields get their converters too
            converters = get_converters(fields)

            # builds a list of fields to be parsed as dates
            # (since converter/types don't seem to be great)
            date_cols = get_date_cols(config.fields)

            dtypes = get_dtypes(config.fields)

            log.log(mp.TRACE, "Dtypes before load:\n%s", pp.pformat(dtypes))
            log.log(mp.TRACE, "Converters before load:\n%s",
                    pp.pformat(converters))
            log.log(mp.TRACE, "Fields to load as times:\n%s",
                    pp.pformat(date_cols))

            # keep this commented code to help diagnosing pandas problems
            # from mptcpanalyzer.debug import read_csv_debug
            # fields = [f.fullname for _, f in config.fields.items()]
            # fields =[ "tcp.options.mptcp.sendkey" ]
            # data = mptcpanalyzer.debug.read_csv_debug(fields,
            data = pd.read_csv(
                fd,
                comment='#',
                sep=config.delimiter,
                dtype=dtypes,
                date_parser=date_converter,
                parse_dates=date_cols,
                # ideally DON'T use converters but pandas bugs...
                converters=converters,
                # float_precision="high",  # might be necessary
            )

            log.debug("Finished loading CSV file")
            # 1 to 1 -> can't add new columns
            data.rename(inplace=True,
                        columns={
                            f.fullname: name
                            for name, f in config.fields.items()
                        })

            # add new columns
            data = data.assign(
                **{name: np.nan
                   for name in per_pcap_artificial_fields.keys()})
            column_names = set(data.columns)
            data = data.astype(dtype=artifical_dtypes, copy=False)

            # we want packetid column to survive merges/dataframe transformation
            # so keep it as a column
            # TODO remove ? let other functions do it ?
            data.set_index("packetid", drop=False, inplace=True)

            hashing_fields = [
                name for name, field in config.fields.items() if field.hash
            ]
            log.debug("Hashing over fields %s", hashing_fields)

            # won't work because it passes a Serie (mutable)_
            # TODO generate hashing fields from Fields
            # TODO reference stack overflow problem
            temp = pd.DataFrame(data, columns=hashing_fields)
            data["hash"] = temp.apply(lambda x: hash(tuple(x)), axis=1)

    except Exception as e:
        log.error(
            "You may need to filter your pcap more to keep only mptcp packets")
        raise e

    log.info("Finished loading dataframe for %s. Size=%d", input_file,
             len(data))

    return data
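# Minimal usage sketch (the capture name is taken from the examples used elsewhere on this
# page; a default-constructed TsharkConfig is an assumption): load a pcap into a dataframe
# and peek at a few of the renamed columns.
config = TsharkConfig()
df = load_into_pandas("examples/client_2_filtered.pcapng", config)
print(df[["packetid", "tcpstream"]].head())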