示例#1
0
    def test_setting_fileh(self):
        """file_handle can be assigned after construction or passed as the
        file_handle= constructor argument; either way the Fsdb object then
        iterates rows from that handle."""
        DATA_FILE = "pyfsdb/tests/tests.fsdb"
        f = pyfsdb.Fsdb()

        # a bare Fsdb() has no handle yet
        self.assertFalse(f.file_handle, "file_handle should not be available")

        fh = open(DATA_FILE, "r")
        self.assertTrue(fh, "file opened manually")

        # late assignment of the handle should wire up reading
        f.file_handle = fh
        self.assertTrue(f.file_handle == fh, "file_handle was set properly")

        row = next(f)
        # after the first read, __next__ is bound to the array-style reader
        self.assertTrue(f.__next__ == f._next_as_array, "read type was set")
        self.assertTrue(row, 'row one is returned')
        self.assertTrue(row[0] == 'rowone')

        # create a new object instead
        # NOTE(review): the previous fh is never closed explicitly; this
        # relies on garbage collection -- consider closing the handles
        fh = open(DATA_FILE, "r")
        f = pyfsdb.Fsdb(file_handle=fh)

        row = next(f)
        self.assertTrue(row, 'row one is returned')
        self.assertTrue(row[0] == 'rowone')

        # check that it works as an iterator
        fh = open(DATA_FILE, "r")
        f = pyfsdb.Fsdb(file_handle=fh)

        count = 0
        for row in f:
            count += 1

        self.assertTrue(count > 0, "at least one row read")
示例#2
0
def main():
    """Melt selected columns of an input FSDB stream.

    For every input row, emits one output row per requested column,
    containing the column's name, its value, and the values of any extra
    "other" columns.  Reads from args.input_file, writes to
    args.output_file.
    """
    args = parse_args()

    # extract the column names from the parsed arguments
    key_column = args.key_column
    value_column = args.value_column
    other_columns = args.other_columns
    columns = args.columns

    # open the input stream (rows returned as dictionaries) and the output
    # stream; 'input_fsdb' avoids shadowing the builtin input()
    input_fsdb = pyfsdb.Fsdb(file_handle=args.input_file,
                             return_type=pyfsdb.RETURN_AS_DICTIONARY)
    output = pyfsdb.Fsdb(out_file_handle=args.output_file)
    output.out_column_names = [key_column, value_column] + other_columns

    # emit one (column-name, value, other...) row per requested column
    for row in input_fsdb:
        for column in columns:
            out_row = [column, row[column]]
            for other in other_columns:
                out_row.append(row[other])
            output.append(out_row)

    output.close()
示例#3
0
def main():
    """Build the command similarity graph and optionally dump FSDB files
    listing its weighted edges and its clusters."""
    args = parse_args()

    cmd2template = get_cmd2template(args.input_file[0])
    (weightDic, cmdIPsDic,
     sourceDic, cmdToArray) = get_info(args.input_file[0], cmd2template)
    G, weighted_edges, labels = draw_networkx(
        args, weightDic, cmdIPsDic, sourceDic, cmdToArray)
    clusters = get_clusters(G)

    if args.edge_list:
        # one row per weighted edge, tagged with its cluster
        edge_out = pyfsdb.Fsdb(out_file=args.edge_list)
        edge_out.out_column_names = [
            'cluster_id', 'node1_id', 'node2_id', 'node1', 'node2', 'weight'
        ]
        for cmd1, cmd2, weight in weighted_edges:
            edge_out.append([clusters[cmd1], labels[cmd1], labels[cmd2],
                             cmd1, cmd2, round(weight, 3)])
        edge_out.close()

    if args.cluster_list:
        # one row per command, mapping it to its cluster id
        cluster_out = pyfsdb.Fsdb(out_file=args.cluster_list)
        cluster_out.out_column_names = ['cluster_id', 'command']
        for cmd, cluster_id in clusters.items():
            cluster_out.append([cluster_id, cmd])
        cluster_out.close()
示例#4
0
def main():
    """Sort an FSDB stream by a key column using two passes over the file.

    Pass 1 counts how many rows each key has; pass 2 re-reads the file,
    writing rows whose key is the currently-expected (sorted) key and
    caching rows that arrive early until their key comes up.
    """
    args = parse_args()

    f = pyfsdb.Fsdb(args.input_file)
    key_column = f.get_column_number(args.column)
    key_counts = {}

    line_count = 0
    cached_count = 0

    # memorize all the keys and the number of rows in each
    for row in f.next_as_array():
        line_count += 1
        if row[key_column] not in key_counts:
            key_counts[row[key_column]] = 1
        else:
            key_counts[row[key_column]] += 1

    # process the keys in sorted order
    key_list = list(key_counts.keys())
    key_list.sort()

    # re-open the input file to re-read
    f = pyfsdb.Fsdb(args.input_file, out_file_handle=args.output_file)

    # rows that arrived before their key became current, keyed by key
    stored_lines = {}

    # second pass: emit rows in key order, caching early arrivals
    current_key = key_list.pop(0)
    for row in f.next_as_array():
        if row[key_column] != current_key:
            # the current lines are arriving too early; cache them
            cached_count += 1
            if row[key_column] not in stored_lines:
                stored_lines[row[key_column]] = [row]
            else:
                stored_lines[row[key_column]].append(row)
        else:
            f.append(row)
            key_counts[current_key] -= 1

            # a loop, because flushing a cached key may exhaust it too
            while key_counts[current_key] == 0:
                # we're done with this key list
                if len(key_list) == 0:
                    break  # done!

                # grab a new key
                current_key = key_list.pop(0)

                # write out any cached lines
                if current_key in stored_lines:
                    for stored_row in stored_lines[current_key]:
                        f.append(stored_row)
                        key_counts[current_key] -= 1
                    del stored_lines[current_key]

    f.write_finish()

    if args.verbose:
        sys.stderr.write("cached %d/%d lines\n" % (cached_count, line_count))
示例#5
0
    def read_data(self):
        """Read the ip2asn data file into memory.

        Populates self._data (one row per range) and self._left_keys (the
        integer start of each range, for bisection).  Accepts self._file as
        either a file name (str) or an open file handle.
        """
        if isinstance(self._file, str):
            # assume a file name
            iptoasn = pyfsdb.Fsdb(self._file)
        else:
            # assume it's a file handle instead
            iptoasn = pyfsdb.Fsdb(file_handle=self._file)

        # set the column names for pyfsdb
        iptoasn.column_names = ['start', 'end', 'ASN', 'country', 'name']

        (self._start_col, self._end_col, self._asn_col, self._country_col,
         self._name_col) = iptoasn.get_column_numbers(iptoasn.column_names)

        # XXX: fsdb should do this for us
        self._data = []
        self._left_keys = []
        for row in iptoasn:
            try:
                row[self._start_col] = int(row[self._start_col])
                row[self._end_col] = int(row[self._end_col])
            except (ValueError, TypeError):
                # narrowed from a bare 'except:' -- int() failing means the
                # fields must be addresses, not integers; convert them
                row[self._start_col] = self.ip2int(row[self._start_col])
                row[self._end_col] = self.ip2int(row[self._end_col])

            self._data.append(row)
            self._left_keys.append(int(row[self._start_col]))
示例#6
0
文件: main.py 项目: hardaker/ip2asn
def process_fsdb(i2a, inh, outh, key, by_asn=False):
    """Annotate each row of an FSDB stream with ip2asn lookup results.

    i2a:    an ip2asn lookup object (provides lookup_asn / lookup_address)
    inh:    input file handle
    outh:   output file handle
    key:    name of the column whose value is looked up
    by_asn: when True, treat the key value as an ASN; otherwise an address
    """
    inf = pyfsdb.Fsdb(file_handle=inh)
    outf = pyfsdb.Fsdb(out_file_handle=outh)
    if by_asn:
        outf.out_column_names = inf.column_names + ASN_COLUMN_NAMES[1:]
    else:
        outf.out_column_names = inf.column_names + COLUMN_NAMES[1:]

    key_col = inf.get_column_number(key)
    for row in inf:
        if by_asn:
            results = i2a.lookup_asn(row[key_col], limit=1)
            if len(results) == 0:
                # BUGFIX: pad with the same number of columns as the
                # success branch below appends (was 5 dashes vs 3 values,
                # producing rows of inconsistent width)
                row.extend(['-', '-', '-'])
            else:
                row.extend([
                    results[0]['owner'], results[0]['country'],
                    results[0]['ip_range']
                ])

        else:
            result = i2a.lookup_address(row[key_col])

            if result:
                row.extend([
                    result['ip_numeric'], result['ASN'], result['owner'],
                    result['country'], result['ip_range']
                ])
            else:
                row.extend(['-', '-', '-', '-', '-'])
        outf.append(row)
示例#7
0
def main():
    """Split an input FSDB stream into one output file per distinct value
    of the selected column, using args.output_pattern to name the files."""
    args = parse_args()

    # open the input file
    inh = pyfsdb.Fsdb(file_handle=args.input_file)
    key_column = inh.get_column_number(args.column)

    # one open output Fsdb handle per distinct key value seen so far
    out_handles = {}

    for row in inh:
        value = row[key_column]

        # see if we have an open file handle for this one yet
        if value not in out_handles:
            # new value, so open a new file handle to save data for it;
            # sanitize the value so it forms a safe file-name fragment
            file_name = re.sub("[^-.0-9a-zA-Z_]", "_", str(value))
            outh = pyfsdb.Fsdb(out_file=(args.output_pattern % file_name))
            # NOTE(review): other writers in this codebase set
            # out_column_names on output handles -- confirm that setting
            # column_names here propagates to the output side
            outh.column_names = inh.column_names
            out_handles[value] = outh

        # save the row to the file based on its value
        out_handles[value].append(row)

    # clean up
    for handle in out_handles:
        out_handles[handle].close()
示例#8
0
    def test_write_out_fsdb(self):
        """Round-trip test: read DATA_FILE, write rows to OUT_FILE, and
        verify the copy; then write with new output columns; then check
        the output-separator token switch."""
        DATA_FILE = "pyfsdb/tests/tests.fsdb"
        OUT_FILE = "pyfsdb/tests/testout.fsdb"

        f = pyfsdb.Fsdb(DATA_FILE, out_file=OUT_FILE)
        self.assertTrue(f, "opened ok")

        # read in all records
        records = []
        for record in f:
            records.append(record)

        self.assertTrue(records[0][0] == 'rowone',
                        'init record ' + records[0][0] + ' is correct')

        # write every record back out unchanged
        for record in records:
            f.write_row(record)

        f.write_finish()

        # re-read the copy and verify its contents
        g = pyfsdb.Fsdb(OUT_FILE)
        rows = []
        for row in g:
            rows.append(row)
        self.check_data(rows)

        # write out new columns
        f = pyfsdb.Fsdb(out_file=OUT_FILE)
        count = 1

        f.out_column_names = ['a', 'b', 'c', 'new_count']
        self.assertTrue(
            len(f.out_column_names) == 4, "correct initial output count")
        # append a running counter as the fourth column of each row
        for row in rows:
            row.append(str(count))
            f.write_row(row)
            count = count + 1
        f.write_finish()

        # check new columns
        g = pyfsdb.Fsdb(filename=OUT_FILE)
        rows = []
        for row in g:
            rows.append(row)
        self.check_data(rows)
        self.assertTrue(rows[0][3] == "1", "new rowone col is correct")
        self.assertTrue(rows[1][3] == "2", "new rowtwo col is correct")

        # check the output token switch
        f = pyfsdb.Fsdb(DATA_FILE, out_file=OUT_FILE)
        self.assertTrue(f, "opened ok")
        # "s" is the token form; the resulting separator should be a space
        f.out_separator_token = "s"
        self.assertTrue(f.out_separator == ' ', "new separator is space")
        for row in f:
            f.write_row(row)
        f.write_finish()
示例#9
0
def main():
    """Pivot an FSDB stream: group values by a time column, producing one
    output row per time value with one column per distinct key value."""
    args = parse_args()

    # set up storage structures
    storage = {}
    columns = {}

    # open the input file stream
    input = pyfsdb.Fsdb(file_handle=args.input_file)

    # from the input, get extract column numbers/names
    time_column = input.get_column_number(args.time_column)
    key_column = input.get_column_number(args.key_column)
    column_names = input.column_names

    # for each row, remember each value based on time and key
    for row in input:
        # if the time hasn't been seen before, allocate the sub-structure
        if row[time_column] not in storage:
            storage[row[time_column]] = {}

        for column_num in range(0, len(row)):
            # remember all values of non-time and non-key columns
            # NOTE(review): this writes the same (time, key) slot on every
            # iteration, so only the LAST non-time/non-key column's value
            # survives -- confirm a single value column is intended
            if column_num != time_column and column_num != key_column:
                storage[row[time_column]][row[key_column]] = row[column_num]
                # record that we've seen this column before
                columns[row[key_column]] = 1

    # open the output stream, and set it's properties
    out = pyfsdb.Fsdb(out_file_handle=args.output_file)

    # the output columns will be a merge of the time column, and
    # previously seen key-index values.
    output_columns = ['time']
    output_columns.extend(columns.keys())
    out.out_column_names = output_columns

    # Output all data, grouped by time_key
    for time_key in storage:

        # create a row containing a column for every seen key
        row = [time_key]
        for column in columns:
            # missing or empty cells are emitted as "0"
            if column not in storage[time_key] or storage[time_key][
                    column] == "":
                row.append("0")
            else:
                row.append(storage[time_key][column])

        # write it out
        out.append(row)
示例#10
0
    def test_missing_header_support_file(self):
        """Column names can be assigned manually when the input file has
        no #fsdb header line, and all name/number lookups then work."""
        DATA_FILE = "pyfsdb/tests/noheader.fsdb"
        f = pyfsdb.Fsdb(DATA_FILE)
        self.assertTrue(f, "opened ok")
        f.column_names = ['colone', 'coltwo', 'colthree']

        headers = f.headers
        self.assertTrue(headers, "headers access exists")

        # name <-> number mappings reflect the assigned columns
        self.assertTrue(f.get_column_name(0) == "colone")
        self.assertTrue(f.get_column_name(1) == "coltwo")
        self.assertTrue(f.get_column_name(2) == "colthree")
        self.assertTrue(f.get_column_number("colone") == 0)
        self.assertTrue(f.get_column_number("coltwo") == 1)
        self.assertTrue(f.get_column_number("colthree") == 2)

        # a synthesized header line is produced from the assigned names
        self.assertTrue(f.header_line == "#fsdb -F t colone coltwo colthree\n")

        cols = f.column_names
        # BUGFIX: message previously said "two cloumns" (typo, and wrong
        # count -- three columns are asserted)
        self.assertTrue(len(cols) == 3, "There are three columns")
        self.assertTrue(cols[0] == "colone", "column one ok")
        self.assertTrue(cols[1] == "coltwo", "column two ok")
        # (an exact duplicate of the next assertion was removed here)
        self.assertTrue(cols[2] == "colthree", "column three ok")
        self.assertTrue(f.column_names[2] == "colthree", "column three ok")
示例#11
0
    def test_setting_columns(self):
        """Assigning column_names on a fresh Fsdb round-trips unchanged."""
        f = pyfsdb.Fsdb()
        self.assertTrue(f, "opened ok")

        expected = ['colone', 'coltwo', 'col3']
        f.column_names = expected
        self.assertTrue(f.column_names == expected)
示例#12
0
    def test_read_header(self):
        """read_header() parses a header line into name/number mappings."""
        HEADER_FILE = "pyfsdb/tests/tests.fsdb"
        f = pyfsdb.Fsdb()
        fileh = open(HEADER_FILE, "r")
        line = next(fileh)
        headers = f.read_header(line)

        self.assertTrue(headers[0] == 0, "header parse is 0 for success")

        header_info = headers[1]

        # the parsed structure exposes these three sections
        for colname in ('names', 'numbers', 'header'):
            self.assertTrue(colname in header_info,
                            "header structure contains " + colname)

        names_info = header_info['names']
        numbers_info = header_info['numbers']

        # names and numbers must be mutual inverses, in file order
        for counter, column in enumerate(('colone', 'coltwo', 'colthree')):
            self.assertTrue(column in names_info,
                            "column info contains data on " + column)
            self.assertTrue(names_info[column] == counter,
                            "column " + column + " is number " + str(counter))

            self.assertTrue(
                numbers_info[counter] == column,
                "column number " + str(counter) + " is labeled " + column)
示例#13
0
def output_to_fsdb(chart_data, output_file_name, column_names):
    """Writes the chart as a FSDB file with start, end, and height values"""
    writer = pyfsdb.Fsdb(out_file=output_file_name)
    # the chart rows carry one extra trailing 'height' column
    writer.out_column_names = column_names + ['height']
    for chart_row in chart_data:
        writer.append(chart_row)
    writer.close()
示例#14
0
    def test_dont_save_command(self):
        """Setting out_command_line to None suppresses the trailing
        '#   | <command>' comment normally appended on close."""
        f = pyfsdb.Fsdb(out_file=self.OUT_FILE)
        f.out_command_line = None
        f.out_file_handle.write("#   | test nowrite\n")
        # NOTE(review): relies on CPython refcounting invoking __del__
        # immediately at 'del f' to flush/close the output
        del f

        self.check_last_line(self.OUT_FILE, "#   | test nowrite\n")
示例#15
0
def json_to_fsdb(input_file, output_file):
    """A function that converts an input file stream of json dictionary
    to an output FSDB file, where the header column names are pulled
    from the first record keys.

    A bad first line aborts with exit(1); later unparseable lines are
    reported to stderr (with the reason) and skipped.
    """
    first_line = next(input_file)

    try:
        rows = json.loads(first_line)
        if not isinstance(rows, list):
            rows = [rows]
    except Exception as exp:
        sys.stderr.write("failed to parse the first line as json:\n")
        sys.stderr.write(first_line)
        sys.stderr.write(str(exp))
        sys.exit(1)

    # column order is the sorted keys of the first record
    columns = sorted(rows[0].keys())
    out_fsdb = pyfsdb.Fsdb(out_file_handle=output_file)
    out_fsdb.out_column_names = columns
    handle_rows(out_fsdb, rows, columns)

    for line in input_file:
        try:
            rows = json.loads(line)
            if not isinstance(rows, list):
                rows = [rows]
            handle_rows(out_fsdb, rows, columns)
        except Exception as exp:
            # BUGFIX: 'exp' was caught but never reported -- include the
            # reason so failures are diagnosable (line keeps its newline)
            sys.stderr.write("failed to parse: " + line)
            sys.stderr.write(str(exp) + "\n")
示例#16
0
def get_info(input_file, cmd2template):
    """ Return four dictionaries: (1) weights between commands, (2) IPs that ran commands, (3) sources for each command, and (4) command to array style string
    Input: input_file (str) - FSDB file with IP and command data, template_file (str) - JSON file with templatized commands
    Output: weightDic (dict) - key: pair of commands (tuple) / value: weight (float), cmdIPsDic (dict) - key: command (str) / value: dictionary with key: source (str) & value: IPs that ran command (list),
    sourceDic (dict) - key: command (str) / value: source label (str), cmdToArray (dict) - key: command (str) / value: array style command (str)
    """
    db = pyfsdb.Fsdb(input_file)
    df = db.get_pandas(data_has_comment_chars=True)

    # normalize bare commands into array-style strings: "cmd" -> "['cmd']"
    df["command"] = df["command"].apply(lambda x: str([x])
                                        if x[0] != "[" else x)
    loggedInOnly = get_loggedInOnly(df)

    # drop IPs that only logged in, and rows with empty command lists
    df2 = df.copy()[~df["ip"].isin(loggedInOnly)]
    df2 = df2[df2["command"] != '[]']

    cmdIPsDic = get_cmdIPsDic(input_file, loggedInOnly)
    templates = get_templates(cmd2template)
    cmds = list(df2["command"].unique())

    unique_cmds, cmdIPsDic = get_uniqueCmds(cmds, cmdIPsDic, templates)

    # cmd[2:-2] strips the "['...']" wrapper to recover the raw command text
    cmdToArray = {cmd[2:-2]: cmd for cmd in unique_cmds}
    unique_cmds = [cmd[2:-2] for cmd in unique_cmds]

    distDic = get_distances(unique_cmds)
    weightDic = get_weights(distDic)
    # label each command with the "+"-joined set of sources that ran it
    sourceDic = {
        cmd: "+".join(list(cmdIPsDic[cmdToArray[cmd]].keys())) + "_cmd"
        for cmd in unique_cmds
    }

    return weightDic, cmdIPsDic, sourceDic, cmdToArray
示例#17
0
def get_commandCounts(input_file):
    """ Counts number of commands run in the dataset and returns dict with command and respective counts
    Input: input_file (str): FSDB file with IP and command data
    Output: cmdCount (dict): maps command to number of times the cmd appears in the data
    """
    db = pyfsdb.Fsdb(input_file)

    command_index = db.get_column_number("command")
    source_index = db.get_column_number("source")

    cmdCount = {}

    for record in db:
        command = record[command_index]

        # cowrie commands are normalized into array-style strings
        if record[source_index] == "cowrie":
            command = str([command])

        # tally via .get() rather than an explicit membership branch
        cmdCount[command] = cmdCount.get(command, 0) + 1

    return cmdCount
示例#18
0
def get_cmdIPsDic(input_file, loggedInOnly):
    """ Returns dict that contains IP addresses that ran the command and from what source
    Input: input_file (str) - FSDB input file, loggedInOnly (list) - list of IPs that only logged in
    Output: cmdIPsDic (dict) - key: command (str) / value: dictionary with key: source (str) & value: IPs that ran command (list)
    """
    cmdIPsDic = {}

    db = pyfsdb.Fsdb(input_file)

    ip_index = db.get_column_number("ip")
    command_index = db.get_column_number("command")
    source_index = db.get_column_number("source")

    for record in db:
        ip_addr = record[ip_index]

        if ip_addr in loggedInOnly:  ## if IP only logged in, do not record
            continue

        src = record[source_index]
        command = record[command_index]

        # normalize bare commands into array-style strings
        if command[0] != "[":
            command = str([command])

        # group by command, then by source, deduplicating IPs
        source_map = cmdIPsDic.setdefault(command, {})
        ip_list = source_map.setdefault(src, [])
        if ip_addr not in ip_list:
            ip_list.append(ip_addr)

    return cmdIPsDic
示例#19
0
    def test_comment_ordering(self):
        """Copying a file with comments at the top row-by-row should
        reproduce the original contents (headers and comments) in order."""
        HEADER_FILE = "pyfsdb/tests/test_comments_at_top.fsdb"
        OUTPUT_FILE = "pyfsdb/tests/test_comments_at_top.test.fsdb"
        f = pyfsdb.Fsdb(filename=HEADER_FILE, out_file=OUTPUT_FILE)
        for row in f:
            f.write_row(row)
        f.write_finish()

        # reaching this point means the copy itself did not raise
        self.assertTrue(True, "got here")

        # load both files fully
        file1 = ""
        with open(HEADER_FILE, "r") as fh:
            file1 = fh.read(8192)

        file2 = ""
        with open(OUTPUT_FILE, "r") as fh:
            file2 = fh.read(8192)

        print("file2:" + file2)

        # startswith: the copy may legitimately gain trailing comment lines
        self.assertTrue(
            file2.startswith(file1),  # ignore added trailers
            "file contents with headers are the same")
示例#20
0
def main():
    """Fill gaps in a time-binned FSDB stream.

    When consecutive rows jump by more than bin_size in the key (time)
    column, synthetic copies of the current row are inserted for each
    missing bin, with the selected columns replaced by a fixed fill value.
    """
    args = parse_args()

    fh = pyfsdb.Fsdb(file_handle=args.input_file,
                     out_file_handle=args.output_file)

    store_columns = fh.get_column_numbers(args.columns)
    time_column = fh.get_column_number(args.key_column)
    value = args.value
    bin_size = args.bin_size

    last_index = None

    for row in fh:
        if last_index is None:  # idiom fix: 'is None', not '== None'
            # first row, just store it
            last_index = int(row[time_column])
        elif last_index != int(row[time_column]):
            # emit one synthetic row per missing bin between the previous
            # timestamp and this row's timestamp
            for skipped_time in range(last_index + bin_size,
                                      int(row[time_column]), bin_size):
                newrow = list(row)
                newrow[time_column] = str(skipped_time)
                for column in store_columns:
                    newrow[column] = value
                fh.append(newrow)
            last_index = int(row[time_column])
        fh.append(row)

    fh.write_finish()
示例#21
0
    def test_save_out_command_from_init(self):
        """out_command_line passed to the constructor is written as a
        trailing '#   | ...' comment when the object is destroyed."""
        f = pyfsdb.Fsdb(self.DATA_FILE,
                        out_file=self.OUT_FILE,
                        out_command_line="test command init")
        self.assertTrue(f, "opened ok")
        # NOTE(review): relies on CPython refcounting running __del__ here
        del f

        self.check_last_line(self.OUT_FILE, "#   | test command init\n")
示例#22
0
    def test_save_out_command_on_del(self):
        """out_command_line assigned after construction is still written
        as the trailing '#   | ...' comment when the object is destroyed."""
        f = pyfsdb.Fsdb(self.DATA_FILE, out_file=self.OUT_FILE)
        self.assertTrue(f, "opened ok")

        f.out_command_line = "test command on del"
        # NOTE(review): relies on CPython refcounting running __del__ here
        del f

        self.check_last_line(self.OUT_FILE, "#   | test command on del\n")
示例#23
0
    def test_out_command_line(self):
        """Setting out_command_line then calling write_finish() appends
        the '#   | <command>' comment to the output file."""

        f = pyfsdb.Fsdb(self.DATA_FILE, out_file=self.OUT_FILE)
        self.assertTrue(f, "opened ok")

        f.out_command_line = "test command"
        f.write_finish()

        self.check_last_line(self.OUT_FILE, "#   | test command\n")
示例#24
0
 def test_foreach(self):
     """foreach() applies the callback to every row and returns the
     collected results in row order."""
     from io import StringIO
     data = "#fsdb -F t a b c\n1\t2\t3\n4\t5\t6\n"
     datah = StringIO(data)
     with pyfsdb.Fsdb(file_handle=datah,
                      return_type=pyfsdb.RETURN_AS_DICTIONARY) as f:
         # rows are dicts, so the callback selects column 'b'
         ret = f.foreach(lambda x: x['b'])
         self.assertEqual(ret, ['2', '5'],
                          "foreach response data is correct")
示例#25
0
    def test_with_usage(self):
        """Fsdb works as a context manager and yields readable rows."""
        DATA_FILE = "pyfsdb/tests/tests.fsdb"
        with pyfsdb.Fsdb(DATA_FILE) as f:
            row = next(f)
            self.assertTrue(row, 'row one is returned')

            # the first row carries the three expected fields, in order
            for index, expected in enumerate(('rowone', 'info', 'data')):
                self.assertTrue(row[index] == expected)
示例#26
0
    def test_array_generator(self):
        """next_as_array() yields each row of the file as an array."""
        f = pyfsdb.Fsdb(self.DATA_FILE)
        self.assertTrue(f, "opened ok")

        # 'rows' -- renamed from 'all', which shadowed the builtin all()
        rows = []
        for r in f.next_as_array():
            rows.append(r)

        self.check_data(rows)
示例#27
0
    def close(self):
        """Output the accumulated match and row results.

        Writes two outputs (match counts, then row counts), either as FSDB
        streams or as plain 'key value = count' text depending on
        self._format.
        """
        self._in_close = True

        # first output: per-match-field value counts
        output = self.new_output(0, output_type="match")
        if self._format == "fsdb":
            output = pyfsdb.Fsdb(out_file_handle=output)
            output.out_column_names = ['type', 'key', 'value', 'count']

        for key in self._match_fields:
            for value in self._match_values[key]:
                if self._format == "fsdb":
                    output.append(
                        ['match', key, value, self._match_values[key][value]])
                else:
                    output.write(
                        f"match {key} {value} = {self._match_values[key][value]}\n"
                    )

        # XXX: fix this ugly hack -- real (non-StringIO) fsdb streams are
        # closed and forgotten here; StringIO streams go through the
        # normal maybe_close_output path so their contents stay readable
        if self._format == "fsdb" and not isinstance(self._stream, StringIO):
            output.close()
            self._stream = None
        else:
            self.maybe_close_output(output)

        # second output: per-row-field value counts
        output = self.new_output(1, output_type="row")
        if self._format == "fsdb":
            output = pyfsdb.Fsdb(out_file_handle=output)
            output.out_column_names = ['type', 'key', 'value', 'count']

        for key in self._row_fields:
            for value in self._row_values[key]:
                if self._format == "fsdb":
                    output.append(
                        ['row', key, value, self._row_values[key][value]])
                else:
                    output.write(
                        f"row {key} {value} = {self._row_values[key][value]}\n"
                    )

        self.maybe_close_output()
        if self._format == "fsdb":
            output.close()
示例#28
0
def main():
    """Render every input FSDB row through a user-supplied format string."""
    args = parse_args()

    # rows come back as dictionaries so format fields match column names
    reader = pyfsdb.Fsdb(file_handle=args.input_file,
                         return_type=pyfsdb.RETURN_AS_DICTIONARY)
    out_stream = args.output_file
    template = args.format

    for record in reader:
        out_stream.write(template.format(**record) + "\n")
示例#29
0
    def test_read_all_data(self):
        """read_fsdb() returns a (status, contents) structure with all rows."""
        HEADER_FILE = "pyfsdb/tests/tests.fsdb"
        f = pyfsdb.Fsdb()
        fileh = open(HEADER_FILE, "r")
        result = f.read_fsdb(fileh)

        # element 0 is the parse status; element 1 holds the contents
        self.assertTrue(result[0] == 0, 'parsing status is 0')
        self.assertTrue('data' in result[1], 'data is in the output')

        self.check_data(result[1]['data'])
示例#30
0
    def test_get_pandas(self):
        """get_pandas(usecols=...) returns a frame restricted to the
        requested columns."""
        f = pyfsdb.Fsdb(self.DATA_FILE)
        self.assertTrue(f, "opened ok")

        # 'frame' -- renamed from 'all', which shadowed the builtin all()
        frame = f.get_pandas(usecols=['coltwo'])
        rows = frame.values.tolist()
        self.assertTrue(len(rows) == 2)
        self.assertTrue(len(rows[0]) == 1)
        self.assertTrue(len(rows[1]) == 1)
        self.assertTrue(rows[0][0] == "info")
        self.assertTrue(rows[1][0] == "other")