Example #1
def _convert_to_enums():
    # workaround for https://github.com/pandas-dev/pandas/issues/25448:
    # categorical columns come back as plain objects after read_csv, so map
    # the per_pcap_artificial_fields destination columns back to their enums
    for col in [
            _first("tcpdest"),
            _first("mptcpdest"),
            _second("tcpdest"),
            _second("mptcpdest")
    ]:
        merged_df[col] = merged_df[col].apply(
            _convert_role, convert_dtype=False)
Example #2
            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:

                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"):
                    np.float64,
                    _second("packetid"):
                    np.float64,
                })

                return dtypes
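The _first/_second helpers used throughout these examples are not shown on this page; judging by the call sites (a single field name here, lists of names elsewhere), they append a per-host suffix. The following is a plausible reconstruction, offered purely as an assumption, with made-up suffix values.

from typing import List, Union

HOST1_SUFFIX, HOST2_SUFFIX = "_h1", "_h2"   # assumed suffix values

def _suffixer(suffix: str):
    def wrapper(fields: Union[str, List[str]]):
        # accept a single field name or a list of names
        if isinstance(fields, str):
            return fields + suffix
        return [f + suffix for f in fields]
    return wrapper

_first = _suffixer(HOST1_SUFFIX)    # hypothetical reconstruction
_second = _suffixer(HOST2_SUFFIX)

print(_first("packetid"))                  # 'packetid_h1'
print(_second(["abstime", "tcpdest"]))     # ['abstime_h2', 'tcpdest_h2']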
Example #3
def map_tcp_packets_via_hash(
        # TODO rename, these are not host1/host2 anymore
        host1_df,
        host2_df,
        *args,
        **kwargs):
    """
    Merge the two dataframes on a hash of several fields.
    The resulting dataframe has columns suffixed with HOST1_SUFFIX / HOST2_SUFFIX.
    """
    log.info("Merging dataframes via hash")
    debug_cols = ["packetid", "hash", "abstime"]
    # debug_dataframe(total, "concatenated df",
    #     usecols=_first(["abstime", "tcpdest"]) + _second(["abstime", "tcpdest"]))

    debug_dataframe(
        host1_df,
        "host1_df",
    )
    debug_dataframe(host2_df, "host2 df")

    # todo we could now use merge_asof
    # TODO here we should be able to drop some columns in double
    try:
        # handle duplicated hashes on each side before merging

        host1_df = deal_with_duplicated_hash(host1_df)
        host2_df = deal_with_duplicated_hash(host2_df)

        res = pd.merge(
            host1_df,
            host2_df,
            on="hash",
            suffixes=(HOST1_SUFFIX, HOST2_SUFFIX),  # columns suffixes
            how="outer",  # we want to keep packets from both
            # we want to know how many packets were not mapped correctly, adds the merge column
            # can take values "left_only"/ "right_only" or both
            indicator="merge_status",
            # run additional checks against duplicate hashes
            validate="one_to_one",  # can slow process
        )

    except pd.errors.MergeError as e:

        # TODO we don't want to print here
        print("An error happened during the merge of the 2 pcaps")
        print(e)
        raise e

    # TCP_DEBUG_FIELDS
    TCP_DEBUG_FIELDS = ['packetid', "abstime"]
    debug_cols = _first(TCP_DEBUG_FIELDS) + _second(TCP_DEBUG_FIELDS)
    debug_dataframe(res, "Result of merging by hash", usecols=debug_cols)
    return res
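The heart of this example is pandas' outer merge with an indicator column and one_to_one validation. Here is a stripped-down illustration on toy data; the "_h1"/"_h2" suffixes are stand-ins for HOST1_SUFFIX/HOST2_SUFFIX.

import pandas as pd

h1 = pd.DataFrame({"hash": [1, 2, 3], "abstime": [0.1, 0.2, 0.3]})
h2 = pd.DataFrame({"hash": [2, 3, 4], "abstime": [0.25, 0.35, 0.45]})

res = pd.merge(
    h1, h2,
    on="hash",
    suffixes=("_h1", "_h2"),      # assumed suffix values
    how="outer",                  # keep unmapped packets from both sides
    indicator="merge_status",     # 'left_only' / 'right_only' / 'both'
    validate="one_to_one",        # raises pd.errors.MergeError on duplicate hashes
)
print(res[["hash", "merge_status"]])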
Example #4
            def _gen_converters() -> Dict[str, Callable]:

                # converters = {}   # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({ name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters
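The returned dict is meant for pandas' converters= parameter, which runs a callable on each raw cell of the named columns while parsing. A minimal usage sketch follows; the column names and the hex flag parser are illustrative only.

import io
import pandas as pd

def parse_flags(value: str) -> int:
    # hypothetical converter: parse a hex flag string such as '0x0018'
    return int(value, 16)

converters = {"tcpflags_h1": parse_flags, "tcpflags_h2": parse_flags}

csv = io.StringIO("tcpflags_h1|tcpflags_h2\n0x0018|0x0010\n")
df = pd.read_csv(csv, sep="|", converters=converters)
print(df)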
Example #5
def merge_tcp_dataframes_known_streams(
    con1: Tuple[pd.DataFrame, TcpConnection],
    con2: Tuple[pd.DataFrame, TcpConnection]
    # , dest: ConnectionRoles
) -> pd.DataFrame:
    """
    Generates an intermediate file with the owds.

    1/ clean up the dataframes to keep only the current stream's packets
    2/ identify which dataframe is the server's and which the client's

    Args:
        con1: Tuple dataframe/tcpstream id
        con2: same

    Returns:
        A merged dataframe with a "_merge" indicator column.
        To ease debugging, packets appear in chronological order.

    """
    h1_df, main_connection = con1
    h2_df, mapped_connection = con2

    logging.info(
        "Trying to merge connection {} to {} of respective sizes {} and {}".
        format(mapped_connection, main_connection, len(h1_df), len(h2_df)))
    # print(h1_df[["packetid","hash", "reltime"]].head(5))
    # print(h2_df[["packetid","hash", "reltime"]].head(5))

    # cleanup the dataframes to contain only the current stream packets
    h1_df = h1_df[h1_df.tcpstream == main_connection.tcpstreamid]
    h2_df = h2_df[h2_df.tcpstream == mapped_connection.tcpstreamid]

    # TODO reorder columns to have packet ids first !
    total = pd.DataFrame()

    for tcpdest in ConnectionRoles:

        log.debug("Looking at tcpdestination %s" % tcpdest)
        q = main_connection.generate_direction_query(tcpdest)
        h1_unidirectional_df = h1_df.query(q)
        q = mapped_connection.generate_direction_query(tcpdest)
        h2_unidirectional_df = h2_df.query(q)

        res = map_tcp_packets(h1_unidirectional_df, h2_unidirectional_df)

        # pandas trick to avoid losing dtype
        # see https://github.com/pandas-dev/pandas/issues/22361#issuecomment-413147667
        # no need to set _second (as they are just opposite)
        # TODO this should be done somewhere else
        # else summary won't work
        res[_first('tcpdest')][:] = tcpdest
        res[_second('tcpdest')][:] = tcpdest

        # generate_mptcp_direction_query
        if isinstance(main_connection, MpTcpSubflow):

            print("THIS IS A SUBFLOW")
            mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            res[_first('mptcpdest')][:] = mptcpdest
            res[_second('mptcpdest')][:] = mptcpdest

            print("Setting mptcpdest to %s", mptcpdest)
            # if tcpdest == main_connection.mptcpdest

        # TODO here we should
        total = pd.concat([res, total])

    # TODO move elsewhere, to outer function
    log.info(
        "Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped)"
        "with input dataframes of size {} and {}.".format(
            len(total), len(total[total._merge == "both"]),
            len(total[total._merge != "both"]), len(h1_df), len(h2_df)))

    # print("unmapped packets:")
    # print(total.loc[total._merge != "both", _sender(TCP_DEBUG_FIELDS) + _receiver(TCP_DEBUG_FIELDS) ])
    return total
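The res[_first('tcpdest')][:] = tcpdest assignments use the trick from pandas issue 22361: writing into the existing column keeps its (categorical) dtype, whereas rebinding the column downgrades it to object. A small sketch of the difference follows; note it relies on chained assignment, and newer pandas versions with copy-on-write behave differently.

import pandas as pd

df = pd.DataFrame({"tcpdest_h1": pd.Categorical(["Client", "Server"])})
print(df["tcpdest_h1"].dtype)     # category

# rebinding the column loses the dtype...
df2 = df.copy()
df2["tcpdest_h1"] = "Client"
print(df2["tcpdest_h1"].dtype)    # object

# ...while writing into the existing column keeps it
# (chained assignment; may warn, and copy-on-write pandas changes this)
df["tcpdest_h1"][:] = "Client"
print(df["tcpdest_h1"].dtype)     # category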
Example #6
def convert_to_sender_receiver(
    df
    # def tcp_compute_owd(
    # already merged df
    # con1: Tuple[pd.DataFrame, TcpConnection],
    # con2: Tuple[pd.DataFrame, TcpConnection]
    # tcp_sender_df,
    # tcp_receiver_df
):
    """
    Convert the dataframe from X_HOST1 | X_HOST2 columns to X_SENDER | X_RECEIVER.

    Each packet has a destination marker.
    Assumes the clocks are synchronized.
    """
    logging.debug("Converting to sender/receiver format")

    total = pd.DataFrame()
    # min_h1 = df.iloc[0, subdf.columns.get_loc(_first('abstime'))]
    # min_h2 = df.iloc[0, subdf.columns.get_loc(_second('abstime'))]

    for tcpstream, subdf in df.groupby(_first("tcpstream")):
        # assume packets are in chronological order, else we would have to use min
        # min_h1 = h1_df['abstime'].min()
        # min_h2 = h2_df['abstime'].min()
        # min_h1 = subdf.loc[0, _first('abstime')]

        min_h1 = subdf.iloc[0, subdf.columns.get_loc(_first('abstime'))]
        min_h2 = subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]
        # min_h2 = subdf[_second('abstime')][0]
        print("min_h1 = %r" % min_h1)
        print("min_h1 float = %f" % min_h1)

        #         def _rename_columns(h1_role: ConnectionRoles):
        #             """
        # client_suffix, server_suffix
        #             Params:
        #                 client_suffix must be one of HOST1_SUFFIX or HOST2_SUFFIX
        #                 server_suffix can be deduced
        #             """
        def _rename_column(col_name, suffixes) -> str:

            for suffix_to_replace, new_suffix in suffixes.items():
                if col_name.endswith(suffix_to_replace):
                    return col_name.replace(suffix_to_replace, new_suffix)
            return col_name

            # total = pd.concat([total, subdf], ignore_index=True)

        # min_h1 = h1_df['abstime'].min()
        # min_h2 = h2_df['abstime'].min()
        logging.debug("Comparing %f (h1) with %f (h2)" % (min_h1, min_h2))
        if min_h1 < min_h2:
            logging.debug("Looks like h1 is the tcp client")
            # suffixes = { HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX }
            h1_role = ConnectionRoles.Client

        else:
            logging.debug("Looks like h2 is the tcp client")
            # suffixes = { HOST2_SUFFIX: SENDER_SUFFIX, HOST1_SUFFIX: RECEIVER_SUFFIX }
            h1_role = (ConnectionRoles.Server)

        print("renaming")
        # _rename_columns(role)
        for tcpdest, tdf in subdf.groupby(_first("tcpdest"), sort=False):
            if tcpdest == h1_role:
                suffixes = {
                    HOST2_SUFFIX: SENDER_SUFFIX,
                    HOST1_SUFFIX: RECEIVER_SUFFIX
                }
            else:
                suffixes = {
                    HOST1_SUFFIX: SENDER_SUFFIX,
                    HOST2_SUFFIX: RECEIVER_SUFFIX
                }

            rename_func = functools.partial(_rename_column, suffixes=suffixes)
            print("renaming inplace")

            tdf.rename(columns=rename_func, inplace=True)
            total = pd.concat([total, tdf], ignore_index=True)

        # subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
        print(subdf.columns)
        print(total.columns)

    logging.debug("Converted to sender/receiver format")
    return total
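The per-direction renaming boils down to df.rename(columns=...) with a callable built by functools.partial. A compact sketch of that mechanism; the suffix values are assumptions.

import functools
import pandas as pd

def _rename_column(col_name: str, suffixes: dict) -> str:
    # replace a host suffix by the corresponding sender/receiver suffix
    for suffix_to_replace, new_suffix in suffixes.items():
        if col_name.endswith(suffix_to_replace):
            return col_name.replace(suffix_to_replace, new_suffix)
    return col_name

df = pd.DataFrame({"abstime_h1": [0.1], "abstime_h2": [0.2]})
suffixes = {"_h1": "_snd", "_h2": "_rcv"}   # assumed suffix values
rename_func = functools.partial(_rename_column, suffixes=suffixes)
print(df.rename(columns=rename_func).columns.tolist())   # ['abstime_snd', 'abstime_rcv']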
Example #7
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,  # Union[MpTcpStreamId, TcpStreamId],
        streamid2: int,
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp

        mapping_mode: Only HASH works for now

    Returns:
        a dataframe with columns... owd ?
    """
    log.debug(
        "Asked to load merged tcp streams %d and %d from pcaps %s and %s" %
        (streamid1, streamid2, pcap1, pcap2))

    cache = mp.get_cache()
    protocolStr = "mptcp" if mptcp else "tcp"

    cacheid = cache.cacheuid(
        "merged", [
            getrealpath(pcap1),
            getrealpath(pcap2),
        ], protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, streamid2)

                # TODO generate
                # map_mptcp_connection()

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, streamid1)
                other_connection = TcpConnection.build_from_dataframe(
                    df2, streamid2)

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            logging.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # at this point we lost the dtype: tcpdest became an object
            print("saving with dtypes=", dict(merged_df.dtypes))
            # print("MERGED_DF", merged_df[TCP_DEBUG_FIELDS].head(20))

            # if log level >= DEBUG then save to xls too !
            # if True:
            #     filename = cachename + ".xls"
            #     logging.debug("Saved a debug excel copy at %s" % filename)
            #     merged_df.to_excel(filename)

        else:
            logging.info("Loading from cache %s" % cachename)

            # dtypes = {k: v for k, v in temp.items() if v is not None or k not in ["tcpflags"]}

            def _gen_dtypes(fields) -> Dict[str, Any]:
                dtypes = {}  # type: ignore
                for _name in [_first, _second]:

                    # TODO this could be simplified
                    for k, v in fields.items():
                        if v is not None or k not in ["tcpflags"]:
                            dtypes.setdefault(_name(k), v)

                    # add generated field dtypes
                    dtypes.update({
                        _name(f.fullname): f.type
                        for f in per_pcap_artificial_fields.values()
                    })

                # these are overrides from the generated dtypes
                dtypes.update({
                    # during the merge, we join even unmapped packets so some entries
                    # may be empty => float64
                    _first("packetid"):
                    np.float64,
                    _second("packetid"):
                    np.float64,
                })

                return dtypes

            def _gen_converters() -> Dict[str, Callable]:

                # converters = {}   # type: Dict[str, Any]
                fields = dict(tshark_config.fields)
                fields.update(per_pcap_artificial_fields)
                converters = {}
                # no need to convert tcpflags
                default_converters = {
                    name: f.converter
                    for name, f in fields.items()
                    if f.converter and name != "tcpflags"
                }
                # converters.update({ name: f.converter for name, f in per_pcap_artificial_fields.items() if f.converter})
                for name, converter in default_converters.items():
                    converters.update({
                        _first(name): converter,
                        _second(name): converter
                    })

                return converters

            with open(cachename) as fd:
                dtypes = _gen_dtypes({
                    name: field.type
                    for name, field in tshark_config.fields.items()
                })
                converters = _gen_converters()
                # more recent versions can do without it
                # pd.set_option('display.max_rows', 200)
                # pd.set_option('display.max_colwidth', -1)
                # print("converters=", converters)
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=dtypes,  # popping still generates
                    converters=converters,
                )

                # log.debug("Column names after loading from cache: %s", merged_df.columns)

                # TODO:
                # No columns to parse from file

        # we fix the clocks a posteriori so that the cache is still usable

        logging.debug("Postprocessing clock if needed")
        merged_df[_first('abstime')] += clock_offset1
        merged_df[_second('abstime')] += clock_offset2

        logging.debug("Converting dataframes to be sender/receiver based...")
        # in both cases
        # TODO here we should attribute the definite mptcprole
        # compute owd
        if mptcp:
            print("Should be merging OWDs")
            logging.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
        else:
            # tcp
            # this is where we correct the timestamps
            # rename the host1/host2 columns to _sender / _receiver
            res = convert_to_sender_receiver(merged_df)

            # don't do it here else we might repeat it
            # data["abstime"] += clock_offset

        logging.debug("Computing owds")
        log.debug("Column names: %s", res.columns)
        log.debug("Dtypes after load:%s\n" % dict(res.dtypes))
        print("res=")
    # TODO we don't necessarily need to generate the OWDs here; this could be moved elsewhere
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]
        # .head(40))
        with pd.option_context('float_format', '{:f}'.format):
            print(res[_sender(["ipsrc", "ipdst", "abstime"]) +
                      _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS +
                      ["owd"]])

    except Exception:
        logging.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    # log.debug("Dtypes after load:%s\n" % pp.pformat(merged_df.dtypes))
    log.info("Finished loading. merged dataframe size: %d" % len(merged_df))

    return res
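The one-way delay is just the receiver-side capture time minus the sender-side capture time of the same packet. A toy version of that final subtraction, with hypothetical sender/receiver column names:

import pandas as pd

res = pd.DataFrame({
    "abstime_snd": [10.000, 10.050],   # assumed sender-side timestamps
    "abstime_rcv": [10.020, 10.085],   # assumed receiver-side timestamps
})
res["owd"] = res["abstime_rcv"] - res["abstime_snd"]
print(res["owd"].tolist())   # approximately [0.02, 0.035]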
Example #8
def merge_tcp_dataframes_known_streams(
        con1: Tuple[pd.DataFrame, TcpConnection],
        con2: Tuple[pd.DataFrame, TcpConnection]) -> pd.DataFrame:
    """
    Generates an intermediate file with the owds.

    1/ clean up dataframe to keep
    2/ identify which dataframe is server's/client's

    Args:
        con1: Tuple dataframe/tcpstream id
        con2: same

    Returns:
        A dataframe with a "merge_status" column and valid tcp/mptcp destinations
        To ease debug we want to see packets in chronological order
    """
    h1_df, main_connection = con1
    h2_df, mapped_connection = con2

    log.info(
        "Trying to merge connection {} to {} of respective sizes {} and {}".
        format(mapped_connection, main_connection, len(h1_df), len(h2_df)))

    # cleanup the dataframes to contain only the current stream packets
    h1_df = h1_df[h1_df.tcpstream == main_connection.tcpstreamid]
    h2_df = h2_df[h2_df.tcpstream == mapped_connection.tcpstreamid]

    # TODO reorder columns to have packet ids first !
    total = pd.DataFrame()

    for tcpdest in ConnectionRoles:

        log.debug("Merging tcp destination %s" % tcpdest)
        q = main_connection.generate_direction_query(tcpdest)
        h1_unidirectional_df = h1_df.query(q, engine="python")
        q = mapped_connection.generate_direction_query(tcpdest)
        h2_unidirectional_df = h2_df.query(q, engine="python")

        res = map_tcp_packets(h1_unidirectional_df, h2_unidirectional_df)

        # pandas trick to avoid losing dtype
        # see https://github.com/pandas-dev/pandas/issues/22361#issuecomment-413147667
        # no need to set _second (as they are just opposite)
        # TODO this should be done somewhere else
        # else summary won't work
        res[_first('tcpdest')][:] = tcpdest
        res[_second('tcpdest')][:] = tcpdest

        # generate_mptcp_direction_query
        # TODO this is not always reached ?
        log.info("con of TYPE %r", main_connection)
        if isinstance(main_connection, MpTcpSubflow):

            log.debug("This is a subflow, setting mptcp destinations...")
            mptcpdest = main_connection.mptcp_dest_from_tcpdest(tcpdest)
            log.debug("Setting mptcpdest to {mptcpdest}")
            res[_first('mptcpdest')][:] = mptcpdest
            res[_second('mptcpdest')][:] = mptcpdest

            log.debug("Setting mptcpdest to %s" % mptcpdest)

        total = pd.concat([res, total])
        debugcols = _first(["abstime", "tcpdest", "mptcpdest"]) + \
        _second(["abstime", "tcpdest", "mptcpdest"])
        debug_dataframe(total, "concatenated df", usecols=debugcols)

    log.info(
        "Resulting merged tcp dataframe of size {} ({} mapped packets vs {} unmapped)"
        "with input dataframes of size {} and {}.".format(
            len(total), len(total[total.merge_status == "both"]),
            len(total[total.merge_status != "both"]), len(h1_df), len(h2_df)))

    # print("unmapped packets:")
    # print(total.loc[total._merge != "both", _sender(TCP_DEBUG_FIELDS) + _receiver(TCP_DEBUG_FIELDS) ])
    return total
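Each loop iteration narrows a host's dataframe to one direction through a query string produced by the connection object, then maps packets for that direction only. Below is a self-contained sketch of the filtering step; the enum, the query builder and the port numbers are illustrative, not the project's generate_direction_query.

from enum import Enum
import pandas as pd

class ConnectionRoles(Enum):   # stand-in for the project's enum
    Client = 0
    Server = 1

df = pd.DataFrame({
    "tcpstream": [0, 0, 0, 1],
    "sport": [5000, 80, 5000, 1234],
    "dport": [80, 5000, 80, 22],
})

def generate_direction_query(tcpdest: ConnectionRoles) -> str:
    # hypothetical query builder: packets heading to the server have dport == 80
    return "dport == 80" if tcpdest == ConnectionRoles.Server else "sport == 80"

for tcpdest in ConnectionRoles:
    unidirectional = df.query(generate_direction_query(tcpdest), engine="python")
    print(tcpdest, len(unidirectional))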
Example #9
def convert_to_sender_receiver(df) -> pd.DataFrame:
    """
    Convert dataframe from  X_HOST1 | X_HOST2 to X_SENDER | X_RECEIVER

    each packet has a destination marker
    Assume clocks are fine here !
    """
    log.debug("Converting from host_1/host_2 to sender/receiver format")

    # fill up afterwards
    total = pd.DataFrame()

    for tcpstream, subdf in df.groupby(_first("tcpstream")):

        min_h1 = subdf.iloc[0, subdf.columns.get_loc(_first('abstime'))]
        min_h2 = subdf.iloc[0, subdf.columns.get_loc(_second('abstime'))]

        #         def _rename_columns(h1_role: ConnectionRoles):
        #             """
        # client_suffix, server_suffix
        #             Params:
        #                 client_suffix must be one of HOST1_SUFFIX or HOST2_SUFFIX
        #                 server_suffix can be deduced
        #             """
        def _rename_column(col_name, suffixes) -> str:

            for suffix_to_replace, new_suffix in suffixes.items():
                if col_name.endswith(suffix_to_replace):
                    return col_name.replace(suffix_to_replace, new_suffix)
            return col_name

            # total = pd.concat([total, subdf], ignore_index=True)

        log.debug(f"Comparing {min_h1} (h1) with {min_h2} (h2)")

        assert min_h1 != min_h2, (
            f"Same sending {min_h1} and receiving time {min_h2}."
            "Either the clock is not precise enough or it's a bug"
            " (more likely)")
        if min_h1 < min_h2:
            log.debug("Looks like h1 is the tcp client")
            # suffixes = { HOST1_SUFFIX: SENDER_SUFFIX, HOST2_SUFFIX: RECEIVER_SUFFIX }
            h1_role = ConnectionRoles.Client

        else:
            if min_h1 == min_h2:
                log.warn("there is an issue")

            log.debug("Looks like h2 is the tcp client")
            h1_role = (ConnectionRoles.Server)

        # _rename_columns(role)
        for tcpdest, tdf in subdf.groupby(_first("tcpdest"), sort=False):
            if tcpdest == h1_role:
                suffixes = {
                    HOST2_SUFFIX: SENDER_SUFFIX,
                    HOST1_SUFFIX: RECEIVER_SUFFIX
                }
            else:
                suffixes = {
                    HOST1_SUFFIX: SENDER_SUFFIX,
                    HOST2_SUFFIX: RECEIVER_SUFFIX
                }

            log.debug("suffixes: %s" % suffixes)
            rename_func = functools.partial(_rename_column, suffixes=suffixes)
            log.log(mp.TRACE, "renaming inplace")

            log.debug("total df size = %d" % len(total))
            with pd.option_context('precision', 20):
                debug_cols = _first(["abstime", "tcpdest"]) + _second(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "before rename \n%s", tdf[debug_cols])
                tdf = tdf.rename(columns=rename_func, copy=True, inplace=False)

                debug_cols = _sender(["abstime", "tcpdest"]) + _receiver(
                    ["abstime", "tcpdest"])
                log.log(mp.TRACE, "After rename \n%s" % tdf[debug_cols])
                # print(tdf[debug_cols])
                # debug_dataframe(tdf, "temporary dataframe")
                total = pd.concat(
                    [total, tdf],
                    ignore_index=True,
                    sort=False,
                )
                # print("total df size = %d" % len(total))

        # subdf[ _first("tcpdest") == ConnectionRole.Client] .rename(columns=_rename_cols, inplace=True)
        # print(subdf.columns)
        # print(total.columns)
    # debug_dataframe(total, "total")

    log.debug("Converted to sender/receiver format")
    log.log(mp.TRACE, "Comparing #unique entries %d vs #all %d",
            total[_sender("abstime")].count(), len(total[_sender("abstime")]))
    # assert total[_sender("abstime")].count() == len(total[_sender("abstime")])
    return total
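The client is inferred from the first capture timestamps: whichever side saw its first packet earlier is assumed to have sent the SYN and is treated as the client. A toy version of that comparison (suffixes and values are assumptions):

import pandas as pd

subdf = pd.DataFrame({
    "tcpstream_h1": [0, 0],
    "abstime_h1": [10.000, 10.040],   # h1 saw the first packet earlier...
    "abstime_h2": [10.015, 10.055],   # ...so h1 is assumed to be the client
})

min_h1 = subdf.iloc[0, subdf.columns.get_loc("abstime_h1")]
min_h2 = subdf.iloc[0, subdf.columns.get_loc("abstime_h2")]
h1_is_client = min_h1 < min_h2
print("h1 role:", "Client" if h1_is_client else "Server")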
Example #10
def load_merged_streams_into_pandas(
        pcap1: str,
        pcap2: str,
        streamid1: int,
        streamid2: int,
        # TODO changed to protocol
        mptcp: bool,
        tshark_config: TsharkConfig,
        clock_offset1: int = 0,
        clock_offset2: int = 0,
        mapping_mode: PacketMappingMode = PacketMappingMode.HASH,
        **extra):
    """
    Arguments:
        protocol: mptcp or tcp
        mapping_mode: Only HASH works for now
        clock_offset: untested

    Returns:
        a dataframe with columns... owd ?
    """
    protocolStr = "mptcp" if mptcp else "tcp"
    log.debug(f"Asked to load {protocolStr} merged streams {streamid1} and "
              "{streamid2} from pcaps {pcap1} and {pcap2}")

    cache = mp.get_cache()

    cacheid = cache.cacheuid(
        "merged", [getrealpath(pcap1), getrealpath(pcap2)],
        protocolStr + "_" + str(streamid1) + "_" + str(streamid2) + ".csv")

    # if we can't load that file from cache
    try:
        merged_df = pd.DataFrame()
        res = pd.DataFrame()

        valid, cachename = cache.get(cacheid)
        log.info("Cache validity=%s and cachename=%s" % (valid, cachename))

        # TODO disable when clock_offset is set
        if not valid:
            df1 = load_into_pandas(pcap1,
                                   tshark_config,
                                   clock_offset=clock_offset1)
            df2 = load_into_pandas(pcap2,
                                   tshark_config,
                                   clock_offset=clock_offset2)

            main_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            other_connection = None  # type: Union[MpTcpConnection, TcpConnection]
            if mptcp:
                main_connection = MpTcpConnection.build_from_dataframe(
                    df1, MpTcpStreamId(streamid1))
                other_connection = MpTcpConnection.build_from_dataframe(
                    df2, MpTcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_mptcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            else:
                main_connection = TcpConnection.build_from_dataframe(
                    df1, TcpStreamId(streamid1))
                other_connection = TcpConnection.build_from_dataframe(
                    df2, TcpStreamId(streamid2))

                # for now we use known streams exclusively
                # might be interested to use merge_tcp_dataframes later
                merged_df = merge_tcp_dataframes_known_streams(
                    (df1, main_connection), (df2, other_connection))

            assert cachename
            log.info("Saving into %s" % cachename)
            # trying to export lists correctly
            # print(merged_df.reinjected_in.dropna().head())
            # convert arrays back to strings
            # merged_df.apply(",".join()
            # or abstime ?

            # TODO rechange the flags hex()
            merged_df.to_csv(
                cachename,
                # columns=columns,
                index=False,
                header=True,
                sep=tshark_config.delimiter,
            )

            # tcpdest had become an object instead of a CategoricalDtype
            # see https://github.com/pandas-dev/pandas/issues/22361
            log.log(mp.TRACE, "saving with dtypes=%s", dict(merged_df.dtypes))

        else:
            log.info("Loading from cache %s", cachename)

            date_cols = get_date_cols(tshark_config.fields)

            with open(cachename) as fd:
                # generate fieldlist
                def _gen_fields(fields):
                    gfields = {}  # type: ignore
                    for _name in [_first, _second]:
                        gfields.update(
                            {_name(k): v
                             for k, v in fields.items()})
                    return gfields

                # reltime discarded on save ?
                tshark_config.fields.pop("reltime")
                gfields = _gen_fields(tshark_config.fields)
                merge_dtypes = get_dtypes(gfields)
                # log.log(mp.TRACE, "Using gfields %s" % pp.pformat(gfields))

                # we don't need any converters
                converters = {}
                date_cols = get_date_cols(gfields)

                log.log(mp.TRACE, "Using date_cols %s" % pp.pformat(date_cols))
                log.log(mp.TRACE, "Using dtypes %s" % pp.pformat(merge_dtypes))
                # log.log(mp.TRACE, "Using converters %s" % (pp.pformat(converters)))
                merged_df = pd.read_csv(
                    fd,
                    skip_blank_lines=True,
                    comment='#',
                    # we don't need 'header' when metadata is with comment
                    sep=tshark_config.delimiter,
                    # memory_map=True, # could speed up processing
                    dtype=merge_dtypes,  # popping still generates
                    converters=converters,
                    # date_parser=date_converter,
                    parse_dates=date_cols,
                )
                # at this stage, destinations are NaN

                debug_fields = ["abstime", "tcpstream", "tcpdest", "mptcpdest"]
                mptcpanalyzer.debug.debug_dataframe(
                    merged_df,
                    "Merged dataframe",
                    usecols=(_first(debug_fields) + _second(debug_fields)))

                # workaround bug https://github.com/pandas-dev/pandas/issues/25448
                def _convert_to_enums():
                    # per_pcap_artificial_fields
                    for col in [
                            _first("tcpdest"),
                            _first("mptcpdest"),
                            _second("tcpdest"),
                            _second("mptcpdest")
                    ]:
                        merged_df[col] = merged_df[col].apply(
                            _convert_role, convert_dtype=False)

        # we fix the clocks a posteriori so that the cache is still usable
        log.debug("Postprocessing clock if needed")
        # merged_df[_first('abstime')] += clock_offset1
        # merged_df[_second('abstime')] += clock_offset2

        log.debug("Converting dataframes to be sender/receiver based...")

        # in both cases
        # TODO here we should attribute the definite mptcprole
        if mptcp:
            log.error(
                "We should correct the clocks if the argument is passed !")
            # raise mp.MpTcpException("Implement mptcp merge")

            res = convert_to_sender_receiver(merged_df)
            # fill MPTCP dest ?
        else:
            # tcp
            res = convert_to_sender_receiver(merged_df)

        # log.debug("Sorting by sender abstime")
        # merged_df.sort_values(by=_sender("abstime"), ascending=True, inplace=True)
        # debug_dataframe(res, "checking merge", usecols=["merge_status"])
        # print("%d nan values" % len(res[res.merge_status == np.nan]))

        log.debug("Computing owds")

        debug_dataframe(res, "before owds")
        # TODO we don't necessarily need to generate the OWDs here; this could be moved elsewhere
        res['owd'] = res[_receiver('abstime')] - res[_sender('abstime')]

        debug_dataframe(
            res,
            "owd",
            usecols=["owd", _sender('abstime'),
                     _receiver('abstime')])
        # with pd.option_context('float_format', '{:f}'.format):
        #     print(
        #         res[_sender(["ipsrc", "ipdst", "abstime"])
        #          + _receiver(["abstime", "packetid"]) + TCP_DEBUG_FIELDS + ["owd"] ]
        #     )

    except Exception as e:
        log.exception("exception happened while merging")

    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # print("dtypes=", dict(dtypes))
    log.log(mp.TRACE, "Dtypes after load:%s\n", pp.pformat(res.dtypes))
    log.info("Finished loading. merged dataframe size: %d", len(res))

    return res
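On the cache-hit path, the regenerated dtypes and date columns go straight into pd.read_csv. A minimal sketch of that load follows; the field map, separator and column names are assumptions.

import io
import pandas as pd

# assumed per-field dtypes after suffixing with _h1/_h2
merge_dtypes = {"packetid_h1": "float64", "packetid_h2": "float64"}
date_cols = ["abstime_h1", "abstime_h2"]   # assumed date columns

csv = io.StringIO(
    "packetid_h1|abstime_h1|packetid_h2|abstime_h2\n"
    "1|2021-01-01 00:00:00.000|4|2021-01-01 00:00:00.020\n"
)
merged_df = pd.read_csv(
    csv,
    sep="|",                  # assumed delimiter (tshark_config.delimiter)
    skip_blank_lines=True,
    comment="#",
    dtype=merge_dtypes,
    parse_dates=date_cols,
)
print(merged_df.dtypes)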