Пример #1
0
def get_valid_java_dir_path():
    """Return the decompress directories that also have a decompiled-Java entry.

    Both ``config.DATA_MALWARE_DECOMPRESS_PATH`` and
    ``config.DATA_MALWARE_DECOMPILE_JAVA_PATH`` are listed through
    ``utils.get_files_from_dir``. A decompress path is kept when its last
    backslash-separated component also appears as the last component of one
    of the Java paths. Progress and summary counts are printed to stdout.

    :return: list of matching decompress directory paths, in listing order.
    """
    decompress_dir_paths = utils.get_files_from_dir(
        config.DATA_MALWARE_DECOMPRESS_PATH)
    print("Total get {0} decompress dir paths".format(
        len(decompress_dir_paths)))

    # Paths are split on a literal backslash, i.e. Windows-style paths are
    # expected from utils.get_files_from_dir.
    java_dir_paths = [
        path.split("\\")[-1] for path in utils.get_files_from_dir(
            config.DATA_MALWARE_DECOMPILE_JAVA_PATH)
    ]
    print("Total get {0} java dir paths".format(len(java_dir_paths)))

    # The list is kept for the printed counts; the set makes each membership
    # test O(1) instead of a linear scan per decompress directory.
    java_dir_names = set(java_dir_paths)

    res = []
    for decompress_dir_path in decompress_dir_paths:
        if decompress_dir_path.split("\\")[-1] not in java_dir_names:
            continue
        res.append(decompress_dir_path)
        if len(res) % 100 == 0:
            print(len(res), " pass")

    print("Total get {0} decompress dir paths".format(
        len(decompress_dir_paths)))
    print("Total get {0} java dir paths".format(len(java_dir_paths)))
    print("{0} files pass the filter".format(len(res)))

    return res
Пример #2
0
def update_db(dataset_dir_path, decompress_dir_path):
    """Record dataset files that have no matching decompress directory.

    Every file under ``dataset_dir_path`` whose name (last backslash-separated
    component) is missing from the entries under ``decompress_dir_path`` gets
    a row in ``malware_info`` with ``VALID`` set to ``"decompress error;"``.
    The MD5 is taken as the part of the file name after the last underscore.

    :param dataset_dir_path: directory holding the original files.
    :param decompress_dir_path: directory holding one entry per decompressed
        file.
    :raises NotImplementedError: when ``DEVICE`` is not ``'M'``; only that
        path layout is implemented. (The previous code failed with a
        ``NameError`` at the same point.)
    """
    origin_files = utils.get_files_from_dir(dataset_dir_path)
    print("Get {0} origin files from dataset".format(len(origin_files)))

    decompress_subdirs = utils.get_files_from_dir(decompress_dir_path)
    if DEVICE != 'M':
        # No name-extraction rule exists for other devices, so fail with an
        # explicit message instead of an unbound-variable error further down.
        raise NotImplementedError(
            "update_db only supports DEVICE == 'M', got {0!r}".format(DEVICE))

    decompress_files = [x.split("\\")[-1] for x in decompress_subdirs]

    print("Get {0} decompress subdir paths".format(len(decompress_files)))
    print("{0} files occur error in decompress process".format(
        len(origin_files) - len(decompress_files)))

    print("start insert malware file info that not decompressed correctly")
    # Set lookup keeps the scan linear in the number of origin files.
    decompressed_names = set(decompress_files)
    sql = "INSERT INTO malware_info(MD5,VALID) VALUES(%s,%s)"
    valid = "decompress error;"
    for origin_file in origin_files:
        filename = origin_file.split("\\")[-1]
        if filename in decompressed_names:
            continue
        md5 = filename.split("_")[-1]
        print("Error in decompress:", filename)
        # One pool per row, as before; the finally block guarantees it is
        # released even when the insert fails.
        mysql = MySQLUtils.MyPymysqlPool()
        try:
            mysql.insert(sql, (md5, valid))
        finally:
            mysql.dispose()
Пример #3
0
def get_malwares(dir_path, num_limit=2):
    """Group ``.apk`` files by sub-directory, dropping sparse sub-directories.

    :param dir_path: directory whose entries are scanned via
        ``utils.get_files_from_dir``.
    :param num_limit: minimum number of ``.apk`` files an entry must contain
        to be included in the result.
    :return: dict mapping the entry name (last backslash-separated path
        component) to its list of ``.apk`` files.
    """
    apks_by_dir = {}
    for folder in utils.get_files_from_dir(dir_path):
        apk_files = utils.get_files_from_dir(folder, file_type=".apk")
        if len(apk_files) < num_limit:
            continue
        apks_by_dir[folder.split("\\")[-1]] = apk_files
    return apks_by_dir
Пример #4
0
    def __init__(self, split, img_size, tag, **kwargs):
        """Index the image files of one split and validate the target size.

        :param split: name of the split sub-directory under the tagged data
            directory.
        :param img_size: either an int (square target size, ``crop`` is
            enabled) or a 2-element sequence (used as is, ``crop`` is
            disabled).
        :param tag: sub-directory of ``self.root / self.name`` that must
            exist.
        :param kwargs: accepted but not used here.
        :raises ValueError: if the smallest requested dimension exceeds the
            smallest dimension of the first input image.
        """
        self.data_path = coerce_to_path_and_check_exist(
            self.root / self.name / tag) / split
        self.split = split
        self.tag = tag
        try:
            input_files = get_files_from_dir(self.data_path,
                                             IMG_EXTENSIONS,
                                             sort=True)
        except FileNotFoundError:
            # A missing split directory is treated as an empty dataset.
            input_files = []
        self.input_files = input_files
        # No annotations are available: every sample gets the label -1.
        self.labels = [-1] * len(input_files)
        self.n_classes = 0
        self.size = len(self.input_files)

        if isinstance(img_size, int):
            self.img_size = (img_size, img_size)
            self.crop = True
        else:
            assert len(img_size) == 2
            self.img_size = img_size
            self.crop = False

        if self.size > 0:
            # Only the first file is sampled. The context manager closes the
            # underlying file handle, which a bare Image.open() leaves open.
            with Image.open(self.input_files[0]) as sample:
                sample_size = sample.size
            if min(self.img_size) > min(sample_size):
                raise ValueError(
                    "img_size too big compared to a sampled image size, adjust it or upscale dataset"
                )
Пример #5
0
def update_db(src_md5, des_dir_path, mysql):
    """Store the decompile outcome of every source MD5 in ``malware_info``.

    For each MD5 in ``src_md5``: when no entry under ``des_dir_path`` ends
    with that MD5, ``VALID`` is set to a "decompile ... error;" marker;
    otherwise the ``DECOMPILE_<DECOMPILE_TYPE>_DIR_PATH`` column is set to
    ``des_dir_path/VirusShare_<md5>``. Nothing is written unless ``DEVICE``
    is ``'M'``.

    :param src_md5: collection of MD5 strings that were submitted for
        decompilation.
    :param des_dir_path: directory containing the decompile output entries.
    :param mysql: object exposing ``update(sql, params)``.
    """
    print("Get {0} src files".format(len(src_md5)))

    des_files = utils.get_files_from_dir(des_dir_path)
    print("Get {0} des files from {1}".format(len(des_files), des_dir_path))

    print("{0} files occur error in decompile {1} process".format(
        len(src_md5) - len(des_files), DECOMPILE_TYPE))

    if DEVICE != 'M':
        return

    # MD5 is the text after the last underscore of the entry name. A set
    # makes the per-MD5 lookup O(1) instead of a list scan.
    des_md5 = {x.split("\\")[-1].split("_")[-1] for x in des_files}

    print("start update db")
    # Loop-invariant statements and values are prepared once.
    valid = "decompile {0} error;".format(DECOMPILE_TYPE)
    error_sql = "UPDATE malware_info SET VALID=%s WHERE MD5=%s"
    path_sql = "UPDATE malware_info SET DECOMPILE_{0}_DIR_PATH=%s WHERE MD5=%s".format(
        DECOMPILE_TYPE)
    for md5 in src_md5:
        if md5 not in des_md5:
            print(error_sql % (valid, md5))
            mysql.update(error_sql, (valid, md5))
        else:
            decompile_path = os.path.join(des_dir_path, "VirusShare_" + md5)
            print(path_sql % (decompile_path, md5))
            mysql.update(path_sql, (decompile_path, md5))
Пример #6
0
 def _initialize_table(self):
     """Build the lookup table of resource files found under ``self.input_dir``.

     For every name in ``AVAILABLE_RESRC_NAMES`` the directory
     ``self.input_dir / name`` is searched recursively for files with the
     extensions ``VALID_EXTENSIONS[name]``. The font resource is split one
     level further, by each entry of ``FONT_TYPES``.

     :return: dict mapping a resource name to a list of file paths as
         strings, or, for ``FONT_RESRC_NAME``, to a dict of such lists keyed
         by font type.
     """
     def _listing(directory, extensions):
         # Recursive search, with every hit converted to ``str``.
         found = get_files_from_dir(directory,
                                    valid_extensions=extensions,
                                    recursive=True)
         return [str(entry) for entry in found]

     table = {}
     for name in AVAILABLE_RESRC_NAMES:
         root = self.input_dir / name
         ext = VALID_EXTENSIONS[name]
         if name != FONT_RESRC_NAME:
             table[name] = _listing(root, ext)
             continue
         table[name] = {font: _listing(root / font, ext) for font in FONT_TYPES}
     return table
Пример #7
0
def get_valid_decompress_dir_path():
    """Return the benign decompress directories holding exactly one ``.dex`` file.

    Entries of ``config.DATA_BENIGN_DECOMPRESS_PATH`` are listed through
    ``utils.get_files_from_dir``. Progress is printed every 100 accepted
    directories, and totals are printed before and after the scan.

    :return: list of accepted directory paths, in listing order.
    """
    candidates = utils.get_files_from_dir(config.DATA_BENIGN_DECOMPRESS_PATH)
    print("Total get {0} dir paths".format(len(candidates)))

    kept = []
    for candidate in candidates:
        dex_found = utils.get_files_from_dir(candidate, file_type=".dex")
        if len(dex_found) != 1:
            continue
        passed = len(kept) + 1
        if passed % 100 == 0:
            print(passed, " pass")
        kept.append(candidate)

    print("Total get {0} dir paths".format(len(candidates)))
    print("{0} files pass the filter".format(len(kept)))

    return kept
Пример #8
0
 def __init__(self,
              input_dir,
              output_dir,
              color_label_mapping=COLOR_TO_LABEL_MAPPING,
              img_extension='png',
              verbose=True):
     """Collect the input images and prepare the output directory.

     :param input_dir: existing directory to read from; checked by
         ``coerce_to_path_and_check_exist``.
     :param output_dir: destination directory; created through
         ``coerce_to_path_and_create_dir``.
     :param color_label_mapping: mapping stored unchanged on the instance.
     :param img_extension: extension passed to ``get_files_from_dir`` to
         select the input files.
     :param verbose: flag stored unchanged on the instance.
     """
     # The input is validated and listed before the output directory is
     # created, so a bad input path leaves nothing behind.
     self.input_dir = coerce_to_path_and_check_exist(input_dir)
     matching_files = get_files_from_dir(self.input_dir,
                                         valid_extensions=img_extension)
     self.files = matching_files
     self.output_dir = coerce_to_path_and_create_dir(output_dir)
     self.color_label_mapping, self.verbose = color_label_mapping, verbose
Пример #9
0
def filter_by_dex(decompress_dir_path):
    """Classify decompressed directories by their ``.dex`` count and store the result.

    A directory passes only when it contains exactly one ``.dex`` file. Two
    rules reject a directory: 1. it has no dex file; 2. it has multiple dex
    files. One row per directory (MD5, verdict, path) is inserted into
    ``malware_info``.

    :param decompress_dir_path: directory whose entries are the decompressed
        sample directories. The MD5 of a sample is the text after the last
        underscore of its directory name.
    :return: None
    """

    dir_paths = utils.get_files_from_dir(decompress_dir_path)

    values = []
    count = 0
    for dir_path in dir_paths:
        dex_files = utils.get_files_from_dir(dir_path, file_type=".dex")

        MD5 = dir_path.split("\\")[-1].split("_")[-1]

        if len(dex_files) == 1:
            valid = '1'
            count += 1
            print("pass ", dir_path)
        elif len(dex_files) == 0:
            valid = "no dex file;"
            print(valid, dir_path)
        else:
            valid = "multiple dex file;"
            print(valid, dir_path)

        values.append([MD5, valid, dir_path])

    print("Total get {0} dir paths".format(len(dir_paths)))
    print("{0} files pass the filter".format(count))

    print("start saving to database")
    mysql = MySQLUtils.MyPymysqlPool()
    insert_cmd = "INSERT INTO malware_info (MD5,VALID,DECOMPRESS_DIR_PATH) VALUES(%s,%s,%s)"
    try:
        mysql.insertMany(insert_cmd, values)
    finally:
        # Release the pool even when the bulk insert fails.
        mysql.dispose()
Пример #10
0
def get_segment_set_of(dirpath, train_set_path):
    """
    Collect the segment names found in the labelled files of a directory.

    Only files whose ``dirpath + '/' + name`` appears among the ``Id`` keys
    read from the training CSV are inspected. This function relies on
    Python 2 APIs (``dict.viewkeys``, ``Exception.message``).

    :param dirpath: path to directory.
    :param train_set_path: path to trainLabels.csv .
    :return: set of segments-names extracted from all the files in the given directory.
    """
    segments = set()
    labelled_ids = utils.read_csv(train_set_path, 'Id', 'Class').viewkeys()

    # Pass 1: .asm listings. The segment name is the text before the first
    # ':' on each line.
    asm_ext = utils.ASM_END
    for asm_name in utils.get_files_from_dir(dirpath, '.' + asm_ext):
        asm_path = dirpath + '/' + asm_name
        if asm_path not in labelled_ids:
            continue
        with open('%s.%s' % (asm_path, asm_ext)) as asm_file:
            for row in asm_file:
                segments.add(row.split(':', 1)[0].rstrip('\x00'))

    # Pass 2: .dll binaries. Section names are read with pefile.
    # TODO in ASAFIS the dll list is empty because the .bytes and .dll files are in different dirs,
    # TODO thus the dirpath here is of the .bytes dir but needed .dll dirpath
    dll_ext = utils.DLL_END
    for dll_name in utils.get_files_from_dir(dirpath, '.' + dll_ext):
        dll_path = dirpath + '/' + dll_name
        if dll_path not in labelled_ids:
            continue
        try:
            pe = pefile.PE('%s.%s' % (dll_path, dll_ext))
        except Exception as e:
            # An unparsable binary is reported and skipped.
            print('Error with pefile on file: %s' % dll_name)
            print(e.message)
            continue
        for section in pe.sections:
            segments.add(section.Name.rstrip('\x00'))
    return segments
Пример #11
0
    def _get_input_label_files(self):
        """Return the input files of ``self.data_path`` and their label files.

        :return: ``(input_files, label_files)``. ``label_files`` is ``None``
            when the split is ``'test'`` and no label file exists; otherwise
            it is rebuilt from the inputs so both lists are aligned.
        :raises RuntimeError: if the numbers of inputs and labels differ.
        :raises FileNotFoundError: if an input has no label named after
            ``SEG_GROUND_TRUTH_FMT`` (only checked below 1e5 inputs).
        """
        input_files = get_files_from_dir(self.data_path, INPUT_EXTENSIONS, sort=True)
        label_files = get_files_from_dir(self.data_path, [LABEL_EXTENSION])

        if len(label_files) == 0 and self.split == 'test':
            return input_files, None
        if len(input_files) != len(label_files):
            raise RuntimeError("The number of inputs and labels don't match")

        # The name-by-name check is skipped for very large datasets. The
        # length equality verified above is then the only guarantee, which
        # is why no further assertion is needed in that case.
        if len(input_files) < 1e5:
            # Set lookup keeps this check linear in the number of inputs.
            label_names = {str(p.name) for p in label_files}
            invalid = [
                p.stem for p in input_files
                if SEG_GROUND_TRUTH_FMT.format(p.stem, LABEL_EXTENSION) not in label_names
            ]
            if invalid:
                raise FileNotFoundError("Some inputs don't have corresponding labels: {}".format(' '.join(invalid)))

        # Derive labels from the inputs so index i of both lists matches.
        label_files = [path.parent / SEG_GROUND_TRUTH_FMT.format(path.stem, LABEL_EXTENSION) for path in input_files]
        return input_files, label_files
Пример #12
0
def do_something(dirpath, ending):
    """Print a disassembly of the first two sections of one file.

    The first name returned by ``utils.get_files_from_dir(dirpath, ending)``
    is parsed with ``pefile`` and disassembled with a Capstone x86 64-bit
    disassembler.

    :param dirpath: directory containing the files.
    :param ending: file ending, appended to the listed name to build the
        path.
    """
    first_name = utils.get_files_from_dir(dirpath, ending)[0]
    print(first_name)

    binary = pefile.PE('%s/%s%s' % (dirpath, first_name, ending))

    disassembler = Cs(CS_ARCH_X86, CS_MODE_64)
    for section in binary.sections[:2]:
        raw_bytes = section.get_data()
        # Instruction addresses are reported relative to the section's raw
        # file offset.
        start_address = section.PointerToRawData
        for insn in disassembler.disasm(raw_bytes, start_address):
            print('0x%x:\t%s\t%s' % (insn.address, insn.mnemonic, insn.op_str))
        print('\n')
Пример #13
0
 def __init__(self,
              input_dir,
              output_dir,
              suffix_fmt='-{}',
              out_ext='jpg',
              create_sub_dir=False,
              verbose=True):
     """Collect the PDF files to convert and prepare the output directory.

     :param input_dir: existing directory searched for ``pdf`` files.
     :param output_dir: destination directory; created through
         ``coerce_to_path_and_create_dir``.
     :param suffix_fmt: format string stored on the instance.
     :param out_ext: output extension stored on the instance.
     :param create_sub_dir: flag stored on the instance.
     :param verbose: when true, the number of files found is reported via
         ``print_info``.
     """
     self.input_dir = coerce_to_path_and_check_exist(input_dir)
     self.files = get_files_from_dir(self.input_dir, valid_extensions='pdf')
     self.output_dir = coerce_to_path_and_create_dir(output_dir)
     self.suffix_fmt, self.out_ext = suffix_fmt, out_ext
     self.create_sub_dir, self.verbose = create_sub_dir, verbose
     if not self.verbose:
         return
     summary = "Pdf2Image initialised: found {} files".format(len(self.files))
     print_info(summary)
Пример #14
0
    def __init__(self,
                 input_dir,
                 output_dir,
                 labels_to_extract=None,
                 in_ext=VALID_EXTENSIONS,
                 out_ext='jpg',
                 tag='default',
                 save_annotations=True,
                 straight_bbox=False,
                 add_margin=True,
                 draw_margin=False,
                 verbose=True):
        """Set up the extractor: input files, output directory, logger and model.

        :param input_dir: existing directory searched recursively for files
            with the ``in_ext`` extensions.
        :param output_dir: destination directory, created if needed.
        :param labels_to_extract: labels to keep; ``[1, 4]`` when ``None``.
            Must be a subset of the labels the model was restricted to.
        :param in_ext: accepted input extensions.
        :param out_ext: output extension, stored as ``self.out_extension``.
        :param tag: model sub-directory under ``MODELS_PATH``.
        :param save_annotations: flag stored on the instance.
        :param straight_bbox: flag stored on the instance.
        :param add_margin: flag stored on the instance.
        :param draw_margin: only effective when ``add_margin`` is also true.
        :param verbose: flag stored on the instance.
        :raises ValueError: if ``labels_to_extract`` contains a label outside
            the model's restricted labels.
        """
        # Filesystem setup: inputs first, then output directory and logger.
        self.input_dir = coerce_to_path_and_check_exist(input_dir).absolute()
        self.files = get_files_from_dir(self.input_dir,
                                        valid_extensions=in_ext,
                                        recursive=True,
                                        sort=True)
        self.output_dir = coerce_to_path_and_create_dir(output_dir).absolute()
        self.out_extension = out_ext
        self.logger = get_logger(self.output_dir, name='extractor')

        # Model loading, on GPU when one is available.
        model_path = coerce_to_path_and_check_exist(MODELS_PATH / tag /
                                                    MODEL_FILE)
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        wanted_attributes = ['train_resolution', 'restricted_labels', 'normalize']
        self.model, model_attributes = load_model_from_path(
            model_path,
            device=self.device,
            attributes_to_return=wanted_attributes)
        self.img_size, restricted_labels, self.normalize = model_attributes
        self.model.eval()

        # Label selection, validated against what the model was trained on.
        self.restricted_labels = sorted(restricted_labels)
        if labels_to_extract is None:
            self.labels_to_extract = [1, 4]
        else:
            self.labels_to_extract = sorted(labels_to_extract)
        if not set(self.labels_to_extract).issubset(self.restricted_labels):
            raise ValueError(
                'Incompatible `labels_to_extract` and `tag` arguments: '
                f'model was trained using {self.restricted_labels} labels only'
            )

        self.save_annotations = save_annotations
        self.straight_bbox = straight_bbox
        self.add_margin = add_margin
        self.draw_margin = add_margin and draw_margin
        self.verbose = verbose

        # The raw ``draw_margin`` argument is logged, not the combined flag.
        init_kwargs = {
            'tag': tag,
            'labels_to_extract': self.labels_to_extract,
            'save_annotations': save_annotations,
            'straight_bbox': straight_bbox,
            'add_margin': add_margin,
            'draw_margin': draw_margin
        }
        self.print_and_log_info(
            'Extractor initialised with kwargs {}'.format(init_kwargs))
        self.print_and_log_info(
            'Model characteristics: train_resolution={}, restricted_labels={}'.
            format(self.img_size, self.restricted_labels))
        n_files = len(self.files)
        self.print_and_log_info(
            'Found {} input files to process'.format(n_files))
Пример #15
0
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Useful directories.
# ROOT_DIR is two directory levels above this file; OUTPUT_DIR and FIGURES_DIR
# live under it.
# NOTE(review): TRAIN_DIR and TRAIN_DIR_GT are relative paths, so they resolve
# against the current working directory rather than ROOT_DIR - confirm this is
# intended.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TRAIN_DIR = os.path.join('dataset', 'train')
TRAIN_DIR_GT = os.path.join(TRAIN_DIR, 'gt')
OUTPUT_DIR = os.path.join(ROOT_DIR, 'output')
FIGURES_DIR = os.path.join(OUTPUT_DIR, 'figures')

# Image shape passed to the bounding-box helpers below.
# NOTE(review): presumably (height, width) in pixels - confirm against the helpers.
IMG_SHAPE = (1080, 1920)

if __name__ == '__main__':
    # Get the list of ground-truth files (XML) from the ground-truth directory
    xml_gt = u.get_files_from_dir(TRAIN_DIR_GT)

    # Parse the ground-truth bounding boxes out of the XML files
    df_gt = u.get_bboxes_from_aicity(xml_gt)

    # Add noise to the ground truth; both size and position noise are enabled,
    # each with a factor of 5.0
    bboxes = u.add_noise_to_bboxes(df_gt,
                                   IMG_SHAPE,
                                   noise_size=True,
                                   noise_size_factor=5.0,
                                   noise_position=True,
                                   noise_position_factor=5.0)

    # Randomly create and destroy bounding boxes depending
    # on the probability parameter
    bboxes = u.create_bboxes(bboxes, IMG_SHAPE, prob=0.5)
Пример #16
0
    :param l2f: dict of label-number to file-name.
    :param f_list: list of file-names.
    :param add_labels_set: bool which tell if needed to add a line of set of labels in the file.
    """
    csv_f = open('train_labels_filtered.csv', 'w')
    csv_f.write('Id,Class\n')
    labels_set = set()

    for f_name in f_list:
        # find the label of the file
        for label in l2f:
            if f_name in l2f[label]:
                csv_f.write('%s,%s\n' %
                            (f_name, label))  # write the file and its label
                labels_set.add(label)
                break  # continue to next file

    if add_labels_set:
        labels_set = sorted(labels_set)
        csv_f.write('\nlabels: ' + ','.join(labels_set))
    csv_f.close()


if __name__ == '__main__':
    # Script entry point: write a filtered label CSV for the '.bytes' files
    # found in the 'train50' directory.
    path = 'train50'
    ending = '.bytes'

    label2files = csv_dict_to_new_dict()
    malware_file_list = utils.get_files_from_dir(path, ending)
    create_new_csv(label2files, malware_file_list)
Пример #17
0
import log
import utils


def convert(src_path, des_path):
    """Disassemble ``src_path`` into ``des_path`` with the baksmali jar.

    Runs ``java -jar <config.LIB_BAKSMALI_PATH> d <src_path> -o <des_path>``.
    Failures are written to ``apk2smali.txt`` through ``log.write`` instead
    of being raised.

    :param src_path: path of the file to disassemble.
    :param des_path: output directory handed to baksmali.
    """
    # Local import keeps this function self-contained.
    import subprocess

    try:
        smali_jar_path = config.LIB_BAKSMALI_PATH
        cmd = "java -jar {0} d {1} -o {2}".format(smali_jar_path, src_path,
                                                  des_path)
        print(cmd)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact, unlike the formatted string given to
        # os.system before.
        return_code = subprocess.call(
            ["java", "-jar", smali_jar_path, "d", src_path, "-o", des_path])
        if return_code != 0:
            # A non-zero exit status used to be silently ignored.
            log.write(filename="apk2smali.txt",
                      message="baksmali exited with code {0}".format(
                          return_code),
                      remark=src_path)
    except Exception as e:
        log.write(filename="apk2smali.txt", message=str(e), remark=src_path)


if __name__ == "__main__":
    # Convert every .apk of the crawler directory to smali, skipping the ones
    # whose output directory already exists.
    dir_path = "E:\\WorkPlaces\\PY_WorkPlace\\Tools\\APK_crawler\\apks"
    files = utils.get_files_from_dir(dir_path, file_type=".apk")
    count = len(files)
    print("Get {0} files".format(count))

    for apk_path in files:
        des_path = os.path.join(config.DATA_BENIGN_SMALI_PATH,
                                apk_path.split("\\")[-1])
        # Every visited file is done, converted or skipped, so the
        # "left to convert" figure reaches 0 at the end of the run.
        count -= 1
        # os.path.exists is the correct API; the former os.pat.exist call
        # raised AttributeError on the first file.
        if not os.path.exists(des_path):
            convert(apk_path, des_path)
            log.write(filename="apk2smali.txt",
                      message=str(count) + " left to convert",
                      type="INFO")
Пример #18
0
# Select the output directory for the masks and the image directory to read,
# depending on whether the train or the test set is processed.
if train:
    RESULT_DIR = os.path.join('results', 'masks', 'train')
    IMAGE_DIR = TRAIN_DIR
else:
    RESULT_DIR = os.path.join('results', 'masks', 'test')
    IMAGE_DIR = os.path.join('dataset', 'test')

# If the result directory already exists, delete it with all its contents so
# every run starts from an empty directory
if os.path.exists(RESULT_DIR):
    shutil.rmtree(RESULT_DIR)

# Create directory
os.makedirs(RESULT_DIR)

# Get the list of images in the selected image directory (train or test)
test_images = get_files_from_dir(IMAGE_DIR)
#test_images = os.listdir(TEST_DIR)

# Set threshold based on ranges of interest; each row is a [low, high] pair
ths_h = np.array([
    [0.0, 0.05],  # Red threshold (lower end of the range)
    [0.55, 0.65],  # Blue threshold
    [0.95, 1.0]  # Red threshold (upper end of the range)
])
# The s and v thresholds span the full [0.0, 1.0] range
ths_s = np.array([[0.0, 1.0]])
ths_v = np.array([[0.0, 1.0]])

# Record the start time so the elapsed time can be computed later
t0 = time.time()
# NOTE(review): presumably accumulates per-frame time in code not shown here
t_frame = 0
Пример #19
0
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logger.info("Starting Museum Painting Retrieval")
    """
    ##################################  TASK 1: TEXT  ####################################
    """
    # NOTE(review): the first assignment is redundant, the next statement
    # rebinds the name to a new empty list.
    candidates = list()

    candidates = []

    # The text areas are only computed when this pickle file does not exist
    if not os.path.exists("pkl/bboxes_iou_0.86658.pkl"):
        # Read groundtruth
        gt_annotations = ut.get_db(GTS_BBOXES_DIR)
        # Read images and find text_area; each candidate is stored as
        # [image number, text area] so the list can be sorted by number below
        for f in ut.get_files_from_dir(TRAIN_MUSEUM_DIR,
                                       excl_ext=['DS_Store']):
            img = ut.get_img(TRAIN_MUSEUM_DIR, f)
            candidates.append([
                ut.get_number_from_filename(f),
                text.get_text_area(
                    img, f, gt=gt_annotations[ut.get_number_from_filename(f)])
            ])

        # Sort bboxes by image number, then drop the number and keep the
        # text areas only
        candidates.sort(key=lambda x: x[0])
        candidates = [x[1] for x in candidates]

        # Compute intersection over union
        mean_iou = text.compute_mean_iou(candidates, gt_annotations)

        # Export pkl
Пример #20
0
    def __init__(self,
                 input_dir,
                 output_dir,
                 tag="default",
                 seg_fmt=SEG_GROUND_TRUTH_FMT,
                 labels_to_eval=None,
                 save_annotations=True,
                 labels_to_annot=None,
                 predict_bbox=False,
                 verbose=True):
        """Set up the evaluator: input files, output directory, model and metrics.

        :param input_dir: existing directory searched recursively for files
            with the ``VALID_EXTENSIONS`` extensions.
        :param output_dir: destination directory, created if needed.
        :param tag: model sub-directory under ``MODELS_PATH``.
        :param seg_fmt: format string stored as ``self.seg_fmt``.
        :param labels_to_eval: labels to evaluate; ``[ILLUSTRATION_LABEL]``
            when ``None``. All of them must be labels the model was
            restricted to (enforced with an ``assert``).
        :param save_annotations: flag stored on the instance.
        :param labels_to_annot: labels to annotate; falls back to the
            evaluated labels when falsy.
        :param predict_bbox: flag stored on the instance.
        :param verbose: flag stored on the instance.
        """
        # Filesystem setup: inputs first, then output directory and logger.
        self.input_dir = coerce_to_path_and_check_exist(input_dir).absolute()
        self.files = get_files_from_dir(self.input_dir,
                                        valid_extensions=VALID_EXTENSIONS,
                                        recursive=True,
                                        sort=True)
        self.output_dir = coerce_to_path_and_create_dir(output_dir).absolute()
        self.seg_fmt = seg_fmt
        self.logger = get_logger(self.output_dir, name='evaluator')

        # Model loading, on GPU when one is available.
        model_path = coerce_to_path_and_check_exist(MODELS_PATH / tag /
                                                    MODEL_FILE)
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        wanted_attributes = ['train_resolution', 'restricted_labels', 'normalize']
        self.model, model_attributes = load_model_from_path(
            model_path,
            device=self.device,
            attributes_to_return=wanted_attributes)
        self.img_size, restricted_labels, self.normalize = model_attributes
        self.model.eval()

        # Label bookkeeping.
        self.restricted_labels = sorted(restricted_labels)
        if labels_to_eval is None:
            self.labels_to_eval = [ILLUSTRATION_LABEL]
        else:
            self.labels_to_eval = sorted(labels_to_eval)
        self.labels_to_rm = set(self.restricted_labels).difference(
            self.labels_to_eval)
        shared_labels = set(self.labels_to_eval).intersection(
            self.restricted_labels)
        assert len(shared_labels) == len(self.labels_to_eval)

        # Colour lookups. Label indices are 1-based positions in the sorted
        # restricted label list.
        self.restricted_colors = [
            LABEL_TO_COLOR_MAPPING[label] for label in self.restricted_labels
        ]
        idx_to_color = {}
        for label, color in zip(self.restricted_labels, self.restricted_colors):
            idx_to_color[self.restricted_labels.index(label) + 1] = color
        self.label_idx_color_mapping = idx_to_color
        self.color_label_idx_mapping = {
            color: idx
            for idx, color in self.label_idx_color_mapping.items()
        }

        def _new_running_metrics():
            # Factory for the defaultdict: one metrics object per new key.
            return RunningMetrics(self.restricted_labels, self.labels_to_eval)

        self.metrics = defaultdict(_new_running_metrics)
        self.save_annotations = save_annotations
        self.labels_to_annot = labels_to_annot or self.labels_to_eval
        self.predict_bbox = predict_bbox
        self.verbose = verbose

        # Summary of the configuration.
        self.print_and_log_info('Output dir: {}'.format(
            self.output_dir.absolute()))
        init_kwargs = {
            'labels_to_eval': self.labels_to_eval,
            'save_annotations': save_annotations
        }
        self.print_and_log_info(
            'Evaluator initialised with kwargs {}'.format(init_kwargs))
        self.print_and_log_info('Model tag: {}'.format(model_path.parent.name))
        self.print_and_log_info(
            'Model characteristics: train_resolution={}, restricted_labels={}'.
            format(self.img_size, self.restricted_labels))
        n_files = len(self.files)
        self.print_and_log_info(
            'Found {} input files to process'.format(n_files))