예제 #1
0
def manual_download(captured_sha1):
    """Re-download the file behind the latest capture of ``captured_sha1``.

    The newest ``pe_dumps`` row for the hash (ordered by timestamp) supplies
    the host -- or, when the host is empty, the server address -- and the URL
    path used to rebuild an ``http://`` URL.  The file is fetched through
    ``download_file`` and the resulting checksums are stored in
    ``manual_download_checksums``.

    Arguments:
        captured_sha1: SHA1 of the originally captured file.

    Returns:
        None.  When no ``pe_dumps`` row matches the hash, a message is
        printed and nothing is downloaded or inserted.
    """
    util.setup_socks()
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Only the most recent matching record is needed.
            cursor.execute(
                """
                SELECT dump_id, host, url, server FROM pe_dumps WHERE sha1 = %s
                    ORDER BY timestamp DESC LIMIT 1;""", (captured_sha1, ))
            row = cursor.fetchone()
            if row is None:
                print("No pe_dumps record found for sha1 %s" %
                      (captured_sha1, ))
                return
            dump_id, host, url, server = row

            # If host is null, we use the server IP.
            ordered_host = util.reorder_domain(host) if host else server
            full_url = "http://" + ordered_host
            if url:
                full_url += url
            print("Starting manual download from : %s" % (full_url, ))

            # Prepare the urllib2 request
            req = urllib2.Request(full_url)
            req.add_header("User-Agent", USER_AGENT)

            download_time = time.time()
            sha1, md5, different, is_interesting_file = download_file(
                dump_id, req, captured_sha1)

            # referer_exists is always False: no Referer header is sent.
            cursor.execute(
                """
                INSERT INTO manual_download_checksums(dump_id, sha1,
                md5, different, referer_exists, timestamp, is_pe)
                VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
                (dump_id, sha1, md5, different, False, download_time,
                 is_interesting_file))
        finally:
            # Release the cursor even if the query or download failed.
            cursor.close()
    finally:
        conn.close()
예제 #2
0
def make_syslog_entry(cursor, dump_id, score):
    """Write a syslog record describing the download ``dump_id``.

    Arguments:
        cursor: open DB cursor used to look up the dump.
        dump_id: id of the ``pe_dumps`` row to report.
        score: classifier score, or None.  When given it is converted to
            float and compared against ``amico_threshold``.

    Returns:
        None.  Nothing is logged when the query returns no row.
    """
    # dump_id is passed as a bound parameter rather than interpolated
    # into the SQL text.
    cursor.execute("""
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, num_av_labels, corrupt, file_type
        FROM pe_dumps as pe LEFT JOIN virus_total_scans as vts USING(sha1)
        WHERE (corrupt = 'false' OR num_av_labels IS NOT NULL) AND
            dump_id = %s
        ORDER BY vts.query_time DESC
        """, (dump_id, ))
    row = cursor.fetchone()
    if row is None:
        return
    log_data = list(row)
    log_data[4] = reorder_domain(log_data[4])

    # If a score is passed, classify it; otherwise the report stays "-".
    report = "-"
    if score is not None:
        # Make sure we compare a real number and not a string.
        score = float(score)
        if score > amico_threshold:
            report = "MALWARE"
        else:
            report = "BENIGN"
        report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = (
        "file download -- timestamp: %s, client_ip: %s, server_ip:"
        " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
        " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s"
        % tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
예제 #3
0
def manual_download(captured_sha1):
    """Re-download the file behind the latest capture of ``captured_sha1``.

    The newest ``pe_dumps`` row for the hash (ordered by timestamp) supplies
    the host -- or the server address when the host is NULL -- and the URL
    path used to rebuild an ``http://`` URL.  The file is fetched through
    ``download_file`` and the resulting checksums are stored in
    ``manual_download_checksums``.

    Arguments:
        captured_sha1: SHA1 of the originally captured file.

    Returns:
        None.  When no ``pe_dumps`` row matches the hash, a message is
        printed and nothing is downloaded or inserted.
    """
    util.setup_socks()
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Only the most recent matching record is needed.
            cursor.execute("""
                SELECT dump_id, host, url, server FROM pe_dumps WHERE sha1 = %s
                    ORDER BY timestamp DESC LIMIT 1;""", (captured_sha1,))
            record = cursor.fetchone()
            if record is None:
                print("No pe_dumps record found for sha1 %s" %
                      (captured_sha1,))
                return
            dump_id, host, url, server = record

            if host is None:
                host = server
            ordered_host = util.reorder_domain(host)
            # url can be None when the record has no URL stored; treat it
            # as an empty path instead of failing on the concatenation.
            full_url = "http://" + ordered_host + (url or "")

            # Prepare the urllib2 request
            req = urllib2.Request(full_url)
            req.add_header("User-Agent", USER_AGENT)

            download_time = time.time()
            sha1, md5, different, is_pe = download_file(
                dump_id, req, captured_sha1)

            # referer_exists is always False: no Referer header is sent.
            cursor.execute("""
                INSERT INTO manual_download_checksums(dump_id, sha1,
                md5, different, referer_exists, timestamp, is_pe)
                VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
                (dump_id, sha1, md5, different, False, download_time, is_pe))
        finally:
            # Release the cursor even if the query or download failed.
            cursor.close()
    finally:
        conn.close()
예제 #4
0
def make_syslog_entry(cursor, dump_id):
    """Write a syslog record describing the PE download ``dump_id``.

    The dump's metadata is joined with its mapped VirusTotal scan, and the
    score stored in ``amico_scores`` (if any) is compared against
    ``amico_threshold`` to label the download MALWARE or BENIGN.

    Arguments:
        cursor: open DB cursor used for both lookups.
        dump_id: id of the ``pe_dumps`` row to report.

    Returns:
        None.  Nothing is logged when the dump query returns no row.
    """
    # dump_id is a bound parameter rather than interpolated into the SQL.
    # The pvm.vt_id = vts.vt_id condition ties the scan row to this dump;
    # without it virus_total_scans is cross-joined and the labels come from
    # an arbitrary scan.
    cursor.execute(
        """
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, trusted_av_labels, corrupt
        FROM pe_dumps as pe JOIN ped_vts_mapping as pvm USING(dump_id),
            virus_total_scans as vts
        WHERE dump_id = %s AND
            pvm.vt_id = vts.vt_id
        """, (dump_id, ))
    row = cursor.fetchone()
    if row is None:
        return
    log_data = list(row)
    log_data[4] = reorder_domain(log_data[4])

    cursor.execute(
        """
            SELECT score FROM amico_scores
            WHERE dump_id = %s """, (dump_id, ))
    score_row = cursor.fetchone()
    # "-" is reported when no score is stored for this dump.
    report = "-"
    if score_row is not None and score_row[0] is not None:
        score = score_row[0]
        if score > amico_threshold:
            report = "MALWARE"
        else:
            report = "BENIGN"
        report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = (
        "PE file download -- timestamp: %s, client_ip: %s, server_ip:"
        " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
        " %s, file_size: %s, av_labels: %s, corrupt: %s, amico_score: %s" %
        tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
예제 #5
0
def make_syslog_entry(cursor, dump_id):
    """Emit a syslog line summarising the PE download ``dump_id``.

    Reads the dump's metadata together with its mapped VirusTotal scan, then
    looks up the score in ``amico_scores`` and labels the download MALWARE
    or BENIGN relative to ``amico_threshold``.

    Arguments:
        cursor: open DB cursor used for both lookups.
        dump_id: id of the ``pe_dumps`` row to report.

    Returns:
        None.  Nothing is logged when the dump query returns no row.
    """
    # Bound parameter instead of string interpolation.  The explicit
    # pvm.vt_id = vts.vt_id condition restricts virus_total_scans to the
    # scan mapped to this dump; without it the tables are cross-joined.
    dump_query = """
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, trusted_av_labels, corrupt
        FROM pe_dumps as pe JOIN ped_vts_mapping as pvm USING(dump_id),
            virus_total_scans as vts
        WHERE dump_id = %s AND
            pvm.vt_id = vts.vt_id
        """
    cursor.execute(dump_query, (dump_id,))
    dump_row = cursor.fetchone()
    if dump_row is None:
        return
    log_data = list(dump_row)
    log_data[4] = reorder_domain(log_data[4])

    cursor.execute("""
            SELECT score FROM amico_scores
            WHERE dump_id = %s """, (dump_id, ))
    score_row = cursor.fetchone()
    score = score_row[0] if score_row is not None else None

    # "-" is reported when no score is stored for this dump.
    report = "-"
    if score is not None:
        if score > amico_threshold:
            report = "MALWARE"
        else:
            report = "BENIGN"
        report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = ("PE file download -- timestamp: %s, client_ip: %s, server_ip:"
             " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
             " %s, file_size: %s, av_labels: %s, corrupt: %s, amico_score: %s" %
             tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
예제 #6
0
def db_pe_dumps(file_path, sha1, md5, file_size):
    """Parse a dump file's HTTP metadata and record it in ``pe_dumps``.

    The first six lines of the file are read in order and searched for: a
    numeric timestamp, the endpoint addresses and port, the request line,
    the ``Host`` header, the ``Referer`` header and a ``CORRUPT_PE`` /
    ``CORRUPT_FILE`` marker.  The remainder of the file is searched for the
    ``Server`` and ``Content-Type`` response headers.  Any field whose
    pattern does not match is stored as NULL.

    Arguments:
        file_path: path of the dump file to parse.
        sha1: SHA1 of the dumped payload.
        md5: MD5 of the dumped payload.
        file_size: size of the dumped payload.

    Returns:
        (dump_id, corrupt_pe): id of the inserted row and whether the
        corruption marker was found.
    """
    # NOTE(review): the original comment says the connection runs in
    # autocommit mode; that is set up by util.connect_to_db -- confirm.
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # The context manager closes the file even if parsing fails.
            with open(file_path) as dump_file:
                timestamp_line = dump_file.readline()
                endpoints_line = dump_file.readline()
                request_line = dump_file.readline()
                host_line = dump_file.readline()
                referer_line = dump_file.readline()
                corrupt_line = dump_file.readline()
                # Everything after the six header lines is response data.
                data = dump_file.read()

            # Timestamp
            match = re.search("[0-9]+", timestamp_line)
            timestamp = match.group() if match is not None else None

            # Source and Destination IPs.  srcip ends up in the ``server``
            # column and dstip in the ``client`` column of the INSERT below.
            match = re.search("([0-9.]+):.*-([0-9.]+):([0-9]+)-.*",
                              endpoints_line)
            if match is not None:
                srcip = match.group(2)
                dstip = match.group(1)
                dst_port = match.group(3)
            else:
                srcip = dstip = dst_port = None

            # URL
            match = re.search("(GET|POST|HEAD) (.*) ", request_line)
            if match is not None:
                method = match.group(1)[:10]
                url = match.group(2)
            else:
                method = url = None

            # Host
            match = re.search("Host: (.*)", host_line)
            host = None
            if match is not None:
                host = util.reorder_domain(match.group(1).strip())

            # Referer
            match = re.search("Referer: (.*)", referer_line)
            referer = match.group(1) if match is not None else None

            # CORRUPT_PE
            corrupt_pe = re.search("CORRUPT_(PE|FILE)",
                                   corrupt_line) is not None

            # Server (truncated to 64 characters)
            match = re.search("Server: (.*)", data)
            server = None
            if match is not None:
                server = match.group(1).rstrip("\r")[:64]

            # Content-Type (truncated to 128 characters)
            match = re.search("Content-Type: (.*)", data)
            cont_type = None
            if match is not None:
                cont_type = match.group(1).rstrip("\r")[:128]

            # Database statement
            cursor.execute(
                """
                INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
                referer,server_application,content_type,dst_port,corrupt,file_size)
                VALUES
                (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (
                    sha1,
                    md5,
                    timestamp,
                    srcip,
                    dstip,
                    method,
                    url,
                    host,
                    referer,
                    server,
                    cont_type,
                    dst_port,
                    corrupt_pe,
                    file_size,
                ),
            )
            # Only the newest row for this hash is read, so limit the
            # result to it.
            cursor.execute(
                """
                SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC LIMIT 1
                """,
                (sha1,),
            )
            dump_id = cursor.fetchone()[0]
            print("A new entry on host:%s has been made in pe_dumps table with " "dump_id %s" % (host, dump_id))
            return dump_id, corrupt_pe
        finally:
            # Release the cursor even when parsing or a query failed.
            cursor.close()
    finally:
        conn.close()
예제 #7
0
def insert_twold_based_features(cursor, dump_id):
    """Compute second-level-domain (2LD) download statistics for a dump.

    Looks up the host recorded for ``dump_id`` in ``pe_dumps``, derives a
    ``LIKE`` pattern for its 2LD (or uses the stored host verbatim when it is
    an IP address), aggregates counts and ratios over earlier dumps whose id
    lies strictly between ``dump_id - MAX_PAST_DUMPS`` and ``dump_id``, and
    writes the results into the ``twold_*`` columns of ``weka_features``.

    Arguments:
        cursor: open DB cursor.
        dump_id: id of the dump whose features are computed.

    Returns:
        None.  Returns early, after printing a message, when the dump has no
        host or the 2LD cannot be extracted from a non-IP host.
    """
    cursor.execute(
        """
           SELECT host FROM pe_dumps where
           dump_id = %s""", (dump_id, ))
    row = cursor.fetchone()
    if row is None or not row[0]:
        print("host is None!")
        return

    raw_host = row[0]
    # Bind ``host`` before the try block so the handler below can always
    # refer to it, even when util.reorder_domain() itself raises.
    host = raw_host
    try:
        host = util.reorder_domain(raw_host)
        twold = util.extract_twold(host)
        twold = util.reorder_domain(twold)
        twold += '%'
    except Exception as e:
        # capturing known causes
        if util.is_ip(host):
            twold = raw_host
        else:
            print("Error in extracting 2LD!,  %s %s %s" % (e, host, dump_id))
            return

    # Every query below is restricted to the same host pattern and the same
    # window of past dump ids.
    window = (twold, dump_id, dump_id - MAX_PAST_DUMPS)

    cursor.execute(
        """
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s""", window)
    twold_total_downloads = cursor.fetchone()[0]

    # Disabled vt_month_shelf due to the 403 error from VT
    cursor.execute(
        """
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.num_av_labels = 0 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""", window)
    twold_benign_downloads = cursor.fetchone()[0]

    cursor.execute(
        """
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.trusted_av_labels > 1 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""", window)
    twold_malware_downloads = cursor.fetchone()[0]

    cursor.execute(
        """
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.num_av_labels > 1 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""", window)
    twold_suspicious_downloads = cursor.fetchone()[0]

    # Ratios are undefined (NULL) when there are no past downloads.
    if twold_total_downloads == 0:
        twold_benign_ratio = None
        twold_malware_ratio = None
        twold_suspicious_ratio = None
    else:
        total = twold_total_downloads
        twold_benign_ratio = float(twold_benign_downloads) / total
        twold_malware_ratio = float(twold_malware_downloads) / total
        twold_suspicious_ratio = float(twold_suspicious_downloads) / total

    # The averages are over distinct sha1s (latest dump of each hash)
    cursor.execute(
        """
        SELECT AVG(num_av_labels), AVG(trusted_av_labels)
        FROM
            (SELECT pe.sha1, MAX(dump_id) AS max_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND dump_id > %s AND
                p.corrupt='f') as b
            ON a.max_id = b.dump_id
        WHERE num_av_labels IS NOT NULL""", window + window)
    if cursor.rowcount == 0:
        twold_avg_av_labels = None
        twold_avg_trusted_labels = None
    else:
        twold_avg_av_labels, twold_avg_trusted_labels = cursor.fetchone()

    # the oldest scan report is used to get the # of unknown hashes
    # to remove any bias due to VT submissions
    cursor.execute(
        """
        SELECT COUNT(DISTINCT b.sha1)
        FROM
            (SELECT pe.sha1, MIN(dump_id) AS min_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND dump_id > %s AND
                p.corrupt='f') as b
            ON a.min_id = b.dump_id
        WHERE num_av_labels IS NULL""", window + window)
    twold_unknown_hashes = cursor.fetchone()[0]

    cursor.execute(
        """
        SELECT COUNT(DISTINCT pe.sha1)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.corrupt = 'f' AND
            pe.dump_id < %s AND pe.dump_id > %s """, window)
    twold_total_hashes = cursor.fetchone()[0]
    if twold_total_hashes != 0:
        twold_unknown_hash_ratio = float(
            twold_unknown_hashes) / twold_total_hashes
    else:
        twold_unknown_hash_ratio = None

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute(
            """
                UPDATE weka_features set twold_benign_downloads = %s,
                 twold_malware_downloads = %s,
                 twold_suspicious_downloads = %s,
                 twold_total_downloads = %s,
                 twold_malware_ratio = %s,
                 twold_suspicious_ratio = %s,
                 twold_benign_ratio = %s,
                 twold_avg_av_labels = %s,
                 twold_avg_trusted_labels = %s,
                 twold_unknown_hashes = %s,
                 twold_total_hashes = %s,
                 twold_unknown_hash_ratio = %s
                 where dump_id = %s """,
            (twold_benign_downloads, twold_malware_downloads,
             twold_suspicious_downloads, twold_total_downloads,
             twold_malware_ratio, twold_suspicious_ratio, twold_benign_ratio,
             twold_avg_av_labels, twold_avg_trusted_labels,
             twold_unknown_hashes, twold_total_hashes,
             twold_unknown_hash_ratio, dump_id))
    except Exception as e:
        print(e)
        print("Could not insert twold based features for the dump # %s" %
              (dump_id, ))
예제 #8
0
def insert_hts_based_features(cursor, dump_id):
    """Compute host/2LD/server-based features for a given download.

    Past dumps related to this download (same host, matching 2LD pattern or
    same server) are loaded into a pandas DataFrame, summarised three times
    with ``compute_features_hts`` (per 2LD, per host, per server) and the
    results are written to the ``twold_*``, ``host_*`` and ``server_ip_*``
    columns of ``weka_features``.

    Arguments:
        cursor: DB cursor from an existing DB connection
        dump_id: id of the download to be classified

    Returns:
        None.  Returns early when ``dump_id`` has no row in ``pe_dumps``.
    """
    # Order matches the column order of the UPDATE statement below.
    feature_keys = (
        'benign_downloads', 'malware_downloads', 'suspicious_downloads',
        'total_downloads', 'malware_ratio', 'suspicious_ratio',
        'benign_ratio', 'avg_av_labels', 'avg_trusted_labels',
        'unknown_hashes', 'total_hashes', 'unknown_hash_ratio')
    df_columns = ['dump_id', 'sha1', 'host', 'server', 'tavs', 'navs']

    # also query for timestamp, so we can use to limit how much we go back in time!
    query = " SELECT host,server,DATE(timestamp) FROM pe_dumps WHERE dump_id = %s "

    cursor.execute(query, (dump_id, ))
    row = cursor.fetchone()
    if not row:
        return

    (host, server, date) = row
    domain = util.reorder_domain(host)
    twold = util.reorder_domain(util.extract_twold(domain))
    twold_like = '-NONE-'  # avoids any matching in "pe.host LIKE %s" in the query below
    # NOTE(review): twold_like only becomes a real pattern when no 2LD could
    # be extracted.  When a 2LD is found the LIKE clause matches nothing, so
    # sibling hosts of the same 2LD are fetched only if they share the
    # server.  Confirm this is intended before changing it.
    if twold is None and host is not None:
        twold = host
        twold_like = twold + '.%'

    query = """
        SELECT dump_id,pe.sha1,pe.host,pe.server,trusted_av_labels,num_av_labels
        FROM pe_dumps AS pe 
        JOIN ped_vts_mapping AS pvm 
          USING(dump_id)
        JOIN virus_total_scans AS vts
          USING(vt_id)
        WHERE pe.corrupt = 'f' AND
             (pe.host = %s OR pe.host LIKE %s OR pe.server = %s) AND
              pe.dump_id < %s AND pe.dump_id > %s AND
              pe.timestamp > %s """

    cursor.execute(query,
                   (host, twold_like, server, dump_id, dump_id -
                    MAX_PAST_DUMPS, date - timedelta(days=MAX_PAST_DAYS)))
    tuples = cursor.fetchall()

    # make the results into a pandas data frame
    if not tuples:
        df = ps.DataFrame(index=[], columns=df_columns)
    else:
        df = ps.DataFrame.from_records(tuples)
        df.columns = df_columns

    ### compute twold-based features
    if twold is None:
        # Neither a 2LD nor a host is known: nothing can match, and
        # str.startswith() must not be called with None.
        df_twold = df.iloc[0:0]
    else:
        df_twold = df[df['host'].str.startswith(twold) == True]
    twold_v = compute_features_hts(df_twold)

    ### compute host-based features
    host_v = compute_features_hts(df[df.host == host])

    ### compute server-based features
    server_v = compute_features_hts(df[df.server == server])

    twold_features = tuple(twold_v[key] for key in feature_keys)
    host_features = tuple(host_v[key] for key in feature_keys)
    server_features = tuple(server_v[key] for key in feature_keys)

    query = """ UPDATE weka_features SET
        
        twold_benign_downloads = %s,
        twold_malware_downloads = %s,
        twold_suspicious_downloads = %s,
        twold_total_downloads = %s,
        twold_malware_ratio = %s,
        twold_suspicious_ratio = %s,
        twold_benign_ratio = %s,
        twold_avg_av_labels = %s,
        twold_avg_trusted_labels = %s,
        twold_unknown_hashes = %s,
        twold_total_hashes = %s,
        twold_unknown_hash_ratio = %s,
        
        host_benign_downloads = %s,
        host_malware_downloads = %s,
        host_suspicious_downloads = %s,
        host_total_downloads = %s,
        host_malware_ratio = %s,
        host_suspicious_ratio = %s,
        host_benign_ratio = %s,
        host_avg_av_labels = %s,
        host_avg_trusted_labels = %s,
        host_unknown_hashes = %s,
        host_total_hashes = %s,
        host_unknown_hash_ratio = %s,
        
        server_ip_benign_downloads = %s,
        server_ip_malware_downloads = %s,
        server_ip_suspicious_downloads = %s,
        server_ip_total_downloads = %s,
        server_ip_malware_ratio = %s,
        server_ip_suspicious_ratio = %s,
        server_ip_benign_ratio = %s,
        server_ip_avg_av_labels = %s,
        server_ip_avg_trusted_labels = %s,
        server_ip_unknown_hashes = %s,
        server_ip_total_hashes = %s,
        server_ip_unknown_hash_ratio = %s
        
        where dump_id = %s """

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute(
            query,
            twold_features + host_features + server_features + (dump_id, ))
    except Exception as e:
        print(e)
        print("Could not insert server-based features for the dump # %s" %
              (dump_id, ))
def insert_twold_based_features(cursor, dump_id):
    """Compute second-level-domain (2LD) download statistics for a dump.

    The host stored for ``dump_id`` is turned into a ``LIKE`` pattern for its
    2LD (an IP host is used verbatim).  Counts, ratios and averages are then
    gathered over earlier dumps whose id lies strictly between
    ``dump_id - MAX_PAST_DUMPS`` and ``dump_id`` and stored in the
    ``twold_*`` columns of ``weka_features``.

    Arguments:
        cursor: open DB cursor.
        dump_id: id of the dump whose features are computed.

    Returns:
        None.  Returns early, after printing a message, when the dump has no
        host or the 2LD cannot be extracted from a non-IP host.
    """
    cursor.execute("""
           SELECT host FROM pe_dumps where
           dump_id = %s""", (dump_id, ))
    row = cursor.fetchone()
    if row is None or not row[0]:
        print("host is None!")
        return

    stored_host = row[0]
    # ``host`` must exist before the try block: the handler below reads it,
    # and util.reorder_domain() may be the call that fails.
    host = stored_host
    try:
        host = util.reorder_domain(stored_host)
        twold = util.extract_twold(host)
        twold = util.reorder_domain(twold)
        twold += '%'
    except Exception as e:
        # capturing known causes
        if util.is_ip(host):
            twold = stored_host
        else:
            print("Error in extracting 2LD!,  %s %s %s" % (e, host, dump_id))
            return

    # Shared parameters: host pattern plus the window of past dump ids.
    oldest_id = dump_id - MAX_PAST_DUMPS
    params = (twold, dump_id, oldest_id)
    params_twice = params + params

    cursor.execute("""
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s""",
        params)
    twold_total_downloads = cursor.fetchone()[0]

    # Disabled vt_month_shelf due to the 403 error from VT
    cursor.execute("""
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.num_av_labels = 0 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""",
        params)
    twold_benign_downloads = cursor.fetchone()[0]

    cursor.execute("""
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.trusted_av_labels > 1 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""",
        params)
    twold_malware_downloads = cursor.fetchone()[0]

    cursor.execute("""
        SELECT COUNT(DISTINCT dump_id)
        FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE vts.num_av_labels > 1 AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id""",
        params)
    twold_suspicious_downloads = cursor.fetchone()[0]

    # Ratios are undefined (NULL) when there are no past downloads.
    twold_benign_ratio = None
    twold_malware_ratio = None
    twold_suspicious_ratio = None
    if twold_total_downloads != 0:
        twold_benign_ratio = float(twold_benign_downloads) / twold_total_downloads
        twold_malware_ratio = float(twold_malware_downloads) / twold_total_downloads
        twold_suspicious_ratio = float(twold_suspicious_downloads) / twold_total_downloads

    # The averages are over distinct sha1s (latest dump of each hash)
    cursor.execute("""
        SELECT AVG(num_av_labels), AVG(trusted_av_labels)
        FROM
            (SELECT pe.sha1, MAX(dump_id) AS max_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND dump_id > %s AND
                p.corrupt='f') as b
            ON a.max_id = b.dump_id
        WHERE num_av_labels IS NOT NULL""",
        params_twice)
    if cursor.rowcount == 0:
        twold_avg_av_labels = None
        twold_avg_trusted_labels = None
    else:
        twold_avg_av_labels, twold_avg_trusted_labels = cursor.fetchone()

    # the oldest scan report is used to get the # of unknown hashes
    # to remove any bias due to VT submissions
    cursor.execute("""
        SELECT COUNT(DISTINCT b.sha1)
        FROM
            (SELECT pe.sha1, MIN(dump_id) AS min_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND dump_id > %s AND
                p.corrupt='f') as b
            ON a.min_id = b.dump_id
        WHERE num_av_labels IS NULL""",
        params_twice)
    twold_unknown_hashes = cursor.fetchone()[0]

    cursor.execute("""
        SELECT COUNT(DISTINCT pe.sha1)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.corrupt = 'f' AND
            pe.dump_id < %s AND pe.dump_id > %s """,
        params)
    twold_total_hashes = cursor.fetchone()[0]
    if twold_total_hashes != 0:
        twold_unknown_hash_ratio = float(twold_unknown_hashes) / twold_total_hashes
    else:
        twold_unknown_hash_ratio = None

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute("""
                UPDATE weka_features set twold_benign_downloads = %s,
                 twold_malware_downloads = %s,
                 twold_suspicious_downloads = %s,
                 twold_total_downloads = %s,
                 twold_malware_ratio = %s,
                 twold_suspicious_ratio = %s,
                 twold_benign_ratio = %s,
                 twold_avg_av_labels = %s,
                 twold_avg_trusted_labels = %s,
                 twold_unknown_hashes = %s,
                 twold_total_hashes = %s,
                 twold_unknown_hash_ratio = %s
                 where dump_id = %s """,
                (twold_benign_downloads, twold_malware_downloads,
                 twold_suspicious_downloads,
                 twold_total_downloads, twold_malware_ratio,
                 twold_suspicious_ratio,
                 twold_benign_ratio,
                 twold_avg_av_labels, twold_avg_trusted_labels,
                 twold_unknown_hashes, twold_total_hashes,
                 twold_unknown_hash_ratio, dump_id))
    except Exception as e:
        print(e)
        print("Could not insert twold based features for the dump # %s" %
              (dump_id, ))
예제 #10
0
def db_file_dumps(file_path, sha1, md5, file_size, file_type):
    """Parse a dump file's HTTP metadata and record it in ``pe_dumps``.

    The first six lines of the file are read in order and searched for: a
    numeric timestamp, the endpoint addresses and port, the request line,
    the ``Host`` header, the ``Referer`` header and a ``CORRUPT_FILE``
    marker.  The remainder of the file is searched for the ``Server`` and
    ``Content-Type`` response headers.  Any field whose pattern does not
    match is stored as NULL.

    Arguments:
        file_path: path of the dump file to parse.
        sha1: SHA1 of the dumped payload.
        md5: MD5 of the dumped payload.
        file_size: size of the dumped payload.
        file_type: type label stored in the ``file_type`` column.

    Returns:
        (dump_id, corrupt_pe, host, dstip, srcip): id of the inserted row,
        whether the corruption marker was found, the reordered host and the
        two parsed endpoint addresses.
    """
    # NOTE(review): the original comment says the connection runs in
    # autocommit mode; that is set up by util.connect_to_db -- confirm.
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # The context manager closes the file even if parsing fails.
            with open(file_path) as dump_file:
                timestamp_line = dump_file.readline()
                endpoints_line = dump_file.readline()
                request_line = dump_file.readline()
                host_line = dump_file.readline()
                referer_line = dump_file.readline()
                corrupt_line = dump_file.readline()
                # Everything after the six header lines is response data.
                data = dump_file.read()

            # Timestamp
            found = re.search('[0-9]+', timestamp_line)
            timestamp = found.group() if found is not None else None

            # Source and Destination IPs.  srcip ends up in the ``server``
            # column and dstip in the ``client`` column of the INSERT below.
            found = re.search('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*',
                              endpoints_line)
            if found is not None:
                srcip = found.group(2)
                dstip = found.group(1)
                dst_port = found.group(3)
            else:
                srcip = dstip = dst_port = None

            # URL
            found = re.search('(GET|POST|HEAD) (.*) ', request_line)
            if found is not None:
                method = found.group(1)[:10]
                url = found.group(2)
            else:
                method = url = None

            # Host
            found = re.search('Host: (.*)', host_line)
            host = None
            if found is not None:
                host = util.reorder_domain(found.group(1).strip())

            # Referer
            found = re.search('Referer: (.*)', referer_line)
            referer = found.group(1) if found is not None else None

            # CORRUPT_PE
            corrupt_pe = re.search('CORRUPT_FILE', corrupt_line) is not None

            # Server (truncated to 64 characters)
            found = re.search('Server: (.*)', data)
            server = None
            if found is not None:
                server = found.group(1).rstrip('\r')[:64]

            # Content-Type (truncated to 128 characters)
            found = re.search('Content-Type: (.*)', data)
            cont_type = None
            if found is not None:
                cont_type = found.group(1).rstrip('\r')[:128]

            # Database statement
            cursor.execute(
                """
                INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
                referer,server_application,content_type,dst_port,corrupt,file_size,file_type)
                VALUES
                (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (sha1, md5, timestamp, srcip, dstip, method, url, host,
                 referer, server, cont_type, dst_port, corrupt_pe, file_size,
                 file_type))
            cursor.execute(
                """
                SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC LIMIT 1
                """, (sha1, ))
            dump_id = cursor.fetchone()[0]
            print("A new entry on host:%s has been made in pe_dumps table with "
                  "dump_id %s" % (host, dump_id))
            return dump_id, corrupt_pe, host, dstip, srcip
        finally:
            # Release the cursor even when parsing or a query failed.
            cursor.close()
    finally:
        conn.close()