예제 #1
0
    def _safe_extract_all(self, zipfile, target_dir):
        """Safer version of ZipFile.extractall -- does not allow absolute or upwards-relative paths"""
        for zipinfo in zipfile.infolist():
            # skip absolute or upwards-relative files
            if zipinfo.filename.startswith(('/', '..')):
                warnings.warn('Skipping potentially unsafe file: ' + zipinfo.filename, RuntimeWarning)
                continue

            # target_dir is base directory; extract will create subpaths as necessary
            zipfile.extract(zipinfo, target_dir)
def safe_extract_from_zip(zipfile, filename, out_dir):
    """Extract one member from an open archive, tolerating a missing member.

    Args:
        zipfile: An open archive object exposing ``extract(name, path)``.
        filename: Name of the member to extract.
        out_dir: Directory to extract into.

    Returns:
        Whatever ``zipfile.extract`` returns for the member, or an empty
        string when the lookup raises ``KeyError`` (member not present).
    """
    try:
        extracted_path = zipfile.extract(filename, out_dir)
    except KeyError:
        # A missing member is not treated as an error by this helper.
        extracted_path = ''
    return extracted_path
예제 #3
0
def uncompress(srcfile, destdir):
    """Unpack a tar/tgz, zip or gz file into ``destdir``.

    The format is chosen from the file extension of ``srcfile``:

    * ``.tar`` / ``.tgz`` -- every member is extracted with ``tarfile``.
    * ``.zip`` -- every member is extracted with ``zipfile``.
    * ``.gz`` -- the single compressed stream is written to
      ``destdir/<name without .gz>``.

    Args:
        srcfile: Path of the compressed file.
        destdir: Directory receiving the unpacked content. For ``.gz`` input
            it must already exist.

    Returns:
        ``None`` in the normal case. tar/zip failures are reported with
        ``print`` and still return ``None``; a failure in the ``.gz`` branch
        returns the tuple ``(False, exception, 'gz')``.
    """
    import gzip
    import shutil
    import tarfile
    import zipfile

    file = os.path.basename(srcfile)

    # Test the path that was actually passed in; checking only the basename
    # would resolve it against the current working directory.
    if not os.path.isfile(srcfile):
        print('文件格式不支持或者不是压缩文件')
        return None

    shortname, fmt = os.path.splitext(file)
    fmt = fmt[1:]

    if fmt in ('tgz', 'tar'):
        try:
            # NOTE(review): members are extracted as stored; archives from
            # untrusted sources can contain path-traversal names.
            with tarfile.open(srcfile) as tar:
                for name in tar.getnames():
                    tar.extract(name, destdir)
        except Exception as e:
            print("Can't uncompress {} for {}".format(file, e))
    elif fmt == 'zip':
        try:
            with zipfile.ZipFile(srcfile) as archive:
                for name in archive.namelist():
                    archive.extract(name, destdir)
        except Exception as e:
            print("Can't uncompress {} for {}".format(file, e))
    elif fmt == 'gz':
        try:
            # Write under the name without ".gz": the payload is no longer
            # gzip data, and reusing the source name could truncate srcfile
            # when destdir is its own directory.
            fname = os.path.join(destdir, shortname)
            # Binary mode is required -- the gzip stream yields bytes.
            with gzip.GzipFile(srcfile) as gfile, open(fname, 'wb') as out:
                shutil.copyfileobj(gfile, out)
        except Exception as e:
            return False, e, fmt
    else:
        # Extension is not one of the supported archive types.
        print('文件格式不支持或者不是压缩文件')
    return None
def unzip(zip_file_path):
    """Extract the first ``.dblog`` member of a zip file next to the archive.

    Args:
        zip_file_path: Path of the zip archive to inspect.

    Returns:
        The path of the extracted ``.dblog`` file as reported by
        ``ZipFile.extract``, or ``''`` when the archive holds no such member
        or cannot be read or extracted.
    """
    # The extraction directory is the archive's own directory. This is
    # inferred from the original failure message, which rebuilt the archive
    # path as "<zip_dir>\<zip_file_name>".
    zip_dir = os.path.dirname(os.path.abspath(zip_file_path))

    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            for file_name in zip_file.namelist():
                if os.path.splitext(file_name)[1] == '.dblog':
                    return zip_file.extract(file_name, zip_dir)

        return ''  # no debug log files found in the zip file
    except Exception:
        # Best effort: report the failure and hand back an empty result.
        try:
            is_empty = os.path.getsize(zip_file_path) == 0
        except OSError:
            # e.g. the path does not exist; fall through to the generic message.
            is_empty = False

        if is_empty:
            print('empty file')
        else:
            print("FAILED to extract: " + str(zip_file_path))

        return ''
예제 #5
0
def arff_to_df(URL=URL_child,
               arff_file='Autism-Child-Data.arff',
               force_download=False):
    """Load the autism screening ARFF data set into a cleaned DataFrame.

    Args:
        URL: Location of the zip archive that contains ``arff_file``.
        arff_file: Name (or path) of the ARFF file. When it does not exist
            locally, or ``force_download`` is true, the archive is downloaded
            to ``autism.zip`` and the member is extracted into the current
            working directory.
        force_download: Download and extract even if ``arff_file`` exists.

    Returns:
        A ``pandas.DataFrame`` with renamed columns, byte-string columns
        decoded to categoricals, normalized ``ethnicity`` / ``relation``
        labels, missing ``age`` filled with the median, yes/no columns
        mapped to 1/0, and ``country_of_res`` dropped.
    """
    if force_download or not os.path.exists(arff_file):
        zipped = urlretrieve(URL, 'autism.zip')
        # Close the archive as soon as the member has been extracted.
        with ZipFile(zipped[0], 'r') as archive:
            arff_file = archive.extract(arff_file)

    # Load the records together with the ARFF metadata (attribute names).
    data, description = arff.loadarff(arff_file)

    columns = list(description)

    new_columns = [
        'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
        'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
        'ethnicity', 'jaundice', 'autism', 'country_of_res', 'used_app_before',
        'result', 'age_range', 'relation', 'Class/ASD'
    ]

    df = pd.DataFrame(data, columns=columns)
    df.columns = new_columns

    # Object columns hold byte strings; decode them and store as categoricals.
    for column in list(df.select_dtypes(include=['object']).columns):
        df[column] = df[column].str.decode('utf-8').astype('category')

    # Fix ethnicity discrepancies. Every pattern is a literal string, so
    # regex=False is passed explicitly: '?' on its own is not a valid regular
    # expression and fails on pandas versions where regex matching is the
    # default.
    ethnicity = df['ethnicity'].str.replace('?', 'Unknown', regex=False)
    ethnicity = ethnicity.str.replace(' ', '_', regex=False)
    ethnicity = ethnicity.str.replace(
        '\'Middle_Eastern_\'', 'Middle_Eastern', regex=False)
    ethnicity = ethnicity.str.replace(
        '\'South_Asian\'', 'South_Asian', regex=False)
    ethnicity = ethnicity.str.replace('others', 'Others', regex=False)
    df['ethnicity'] = ethnicity.astype('category')

    # Fill in missing age with the median age.
    df['age'] = df['age'].fillna(value=df['age'].median())

    # Fix relation discrepancies.
    relation_mapper = {
        'Parent': 'Family Member',
        'Relative': 'Family Member',
        '\'Health care professional\'': 'Health care professional',
        '?': 'Unknown',
        'Self': 'Self',
        'self': 'Self'
    }

    # Map on plain objects and fill unmapped labels *before* converting to a
    # categorical: filling a categorical with a value that is not one of its
    # categories raises.
    relation = df['relation'].astype(object).map(relation_mapper)
    df['relation'] = relation.fillna('Unknown').astype('category')

    # Convert the binary yes/no style columns to 1/0.
    binary_mappers = {
        'jaundice': {'yes': 1, 'no': 0},
        'autism': {'yes': 1, 'no': 0},
        'Class/ASD': {'YES': 1, 'NO': 0},
        'used_app_before': {'yes': 1, 'no': 0},
    }
    for column, mapper in binary_mappers.items():
        df[column] = df[column].map(mapper)

    df = df.drop('country_of_res', axis=1)

    return df