Example #1
    def __init__(self, trip_date, dpath):
        service_dates = pd.DataFrame()
        for fi in effective_files(trip_date, dpath=dpath):
            try:
                with ZipFile(fi) as zf:
                    raw_text = zf.read('calendar.txt')
                csvdata = StringIO(raw_text.decode('utf-8'))
                service_dates = pd.concat([service_dates,
                                           pd.read_csv(csvdata)])
            except Exception:
                print('Error reading calendar from ' + fi)
        service_dates.set_index('service_id',
                                drop=True,
                                inplace=True,
                                verify_integrity=True)
        self.service_dates = service_dates
        service_exc = pd.DataFrame(
            columns=['service_id', 'date', 'exception_type'])
        for fi in effective_files(trip_date, dpath=dpath):
            try:
                with ZipFile(fi) as zf:
                    raw_text = zf.read('calendar_dates.txt')
                csvdata = StringIO(raw_text.decode('utf-8'))
                service_exc = pd.concat([
                    service_exc,
                    pd.read_csv(csvdata, dtype={
                        0: str,
                        1: str,
                        2: int
                    })
                ])
            except Exception:
                print('Error reading calendar_dates from ' + fi)
        # service_exc.set_index('date', drop=True, inplace=True)
        self.service_exc = service_exc
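All of the examples on this page share one pattern: open the archive with ZipFile, pull one member into memory, and hand the result to pandas through an in-memory buffer. A minimal self-contained sketch of that pattern in Python 3 (the archive path and member name below are hypothetical):

from io import BytesIO
from zipfile import ZipFile

import pandas as pd

archive_path = 'feed.zip'      # hypothetical archive
member_name = 'calendar.txt'   # hypothetical CSV member

with ZipFile(archive_path) as zf:
    raw_bytes = zf.read(member_name)  # contents of the member, as bytes

# pd.read_csv accepts a binary buffer, so no manual decode is needed here
df = pd.read_csv(BytesIO(raw_bytes))
print(df.head())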
Пример #2
0
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = [
                '{0} {1}'.format(j, hj) for j, hj in enumerate(header, start=1)
            ]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
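A note on the parsing above: in the Fama-French text files each data row starts with a date index, so the header row has one whitespace-separated field fewer than a typical row. Taking the last row whose field count equals ncol - 1 therefore locates the header, and enumerate(header, start=1) prefixes every column name with its position to keep the header unique.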
Пример #3
def get_bookcrossing_data(local_file=None,
                          get_ratings=True,
                          get_users=False,
                          get_books=False):
    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ratings = users = books = None

    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings', flags=2)].iat[0]  # flags=2 is re.IGNORECASE

        delimiter = ';'
        if get_ratings:
            zdata = zfile.read(zip_file)
            ratings = pd.read_csv(BytesIO(zdata),
                                  sep=delimiter,
                                  header=0,
                                  engine='c',
                                  encoding='unicode_escape')

        if get_users:
            zip_file = zip_files[zip_files.str.contains('users',
                                                        flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                users = pd.read_csv(zdata,
                                    sep=delimiter,
                                    header=0,
                                    engine='c',
                                    encoding='unicode_escape')

        if get_books:
            zip_file = zip_files[zip_files.str.contains('books',
                                                        flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                books = pd.read_csv(
                    zdata,
                    sep=delimiter,
                    header=0,
                    engine='c',
                    quoting=1,
                    escapechar='\\',
                    encoding='unicode_escape',
                    usecols=['ISBN', 'Book-Author', 'Publisher'])

    res = [
        data.rename(columns=lambda x: x.lower().replace('book-', '').replace(
            '-id', 'id'),
                    copy=False) for data in [ratings, users, books]
        if data is not None
    ]
    if len(res) == 1: res = res[0]
    return res
Пример #4
def get_movielens_data(local_file=None, get_genres=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    if not local_file:
        # print('Downloading data...')
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
        # print('Done.')
    else:
        zip_contents = local_file

    # print('Loading data into memory...')
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.endswith('ratings.dat')].iat[0]
        zdata = zfile.read(zip_file)
        delimiter = ';'
        # replacing '::' makes the data compatible with the pandas c-engine
        zdata = zdata.replace(b'::', delimiter.encode())
        ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=None, engine='c',
                              names=['userid', 'movieid', 'rating', 'timestamp'],
                              usecols=['userid', 'movieid', 'rating'])

        if get_genres:
            zip_file = zip_files[zip_files.str.endswith('movies.dat')].iat[0]
            with zfile.open(zip_file) as zdata:
                delimiter = '::'
                genres_data = pd.read_csv(zdata, sep=delimiter, header=None, engine='python',
                                            names=['movieid', 'movienm', 'genres'])

            ml_genres = split_genres(genres_data)
            ml_data = (ml_data, ml_genres)

    return ml_data
Пример #5
    def _read_zipfile(self, url):

        zipf = BytesIO(self._get_response(url).content)

        with ZipFile(zipf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

        return data
    def _read_zipfile(self, url):
        raw = self._get_response(url).content

        with tempfile.TemporaryFile() as tmpf:
            tmpf.write(raw)

            with ZipFile(tmpf, 'r') as zf:
                data = zf.open(zf.namelist()[0]).read().decode()

        return data
Пример #7
def _download_data_famafrench(name):
    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

    return data
Пример #8
0
    def _read_zipfile(self, ftppath):

        zipf = BytesIO()
        try:
            self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write)
        except EOFError:
            raise RemoteDataError('FTP server has closed the connection.')
        zipf.seek(0)
        with ZipFile(zipf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

        return StringIO(data)
Пример #9
0
    def _read_one_month(self, symbol, year, month):
        symbol = symbol.replace("/", "").upper()
        filename_cache = os.path.join(self.cache_directory, self._filename(symbol, year, month, '.zip'))
        zip_data, from_file_cache = self._get(symbol, year, month, filename_cache, 'bytes')

        with ZipFile(zip_data, 'r') as zf:
            zfile = zf.open(self._filename(symbol, year, month, '.csv'))
            df = pd.read_csv(zfile, names=['Symbol', 'Date', 'Bid', 'Ask'])

        df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d %H:%M:%S.%f')
        df = df.set_index('Date')
    
        return df
Пример #10
0
def load_trips(trip_date, dpath):
    # dpath argument is the path to the directory containing subdirectories
    trips = pd.DataFrame()
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('trips.txt')
            csvdata = StringIO(raw_text)
            trips = trips.append(pd.read_csv(csvdata))
        except:
            print 'Error reading from ' + fi
    trips.set_index('trip_id', drop=True, inplace=True, verify_integrity=True)
    return trips
def main():
    expire_after = timedelta(days=1)
    if PY2:
        filename = 'cache_py2'
    else:
        filename = 'cache'
    session = requests_cache.CachedSession(cache_name=filename,
                                           expire_after=expire_after)

    dt = pd.to_datetime("2014-01-01")
    symbol = "AUD/USD"
    symbol = symbol.replace("/", "").upper()
    year = dt.year
    month = dt.month
    month_name = datetime.datetime(year=1970, month=month,
                                   day=1).strftime('%B').upper()
    #url = "http://www.truefx.com/dev/data/2014/JANUARY-2014/AUDUSD-2014-01.zip"
    url = "http://www.truefx.com/dev/data/{year:04d}/{month_name}-{year:04d}/{symbol}-{year:04d}-{month:02d}.zip".format(
        year=year, month=month, symbol=symbol, month_name=month_name)
    response = session.get(url)
    zip_data = BytesIO(response.content)
    filename = "{symbol}-{year:04d}-{month:02d}.csv".format(year=year,
                                                            month=month,
                                                            symbol=symbol)

    with ZipFile(zip_data, 'r') as zf:
        #filename = zf.namelist()[0]
        zfile = zf.open(filename)
        #print(zfile)
        #(symb, dt, ask, bid) = zfile.read().split(',')
        #print(zfile.__dict__)
        data = zfile.readlines()
        #df = pd.read_csv(zfile._fileobj)  # ToFix: can't make it work correctly

    #return
    df = pd.DataFrame(data)
    #df = df[:100] # just for test
    df[0] = df[0].str.decode('utf8')
    df[0] = df[0].str.replace('\n', '')
    df[0] = df[0].map(lambda s: s.split(','))
    df['Symbol'] = df[0].map(lambda t: t[0])
    df['Date'] = df[0].map(lambda t: pd.to_datetime(t[1]))
    df['Bid'] = df[0].map(lambda t: t[2]).astype(float)
    df['Ask'] = df[0].map(lambda t: t[3]).astype(float)
    del df[0]
    df = df.set_index('Date')
    print(df)
Пример #12
0
def get_bx_data(file_path, get_ratings=True, get_users=False, get_books=False):
    ratings = users = books = None
    with ZipFile(file_path) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings', flags=2)].iat[0]

        delimiter = ';'
        if get_ratings:
            zdata = zfile.read(zip_file)
            ratings = pd.read_csv(StringIO(zdata),
                                  sep=delimiter,
                                  header=0,
                                  engine='c')

        if get_users:
            zip_file = zip_files[zip_files.str.contains('users',
                                                        flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                users = pd.read_csv(
                    zdata,
                    sep=delimiter,
                    header=0,
                    engine='c',
                )

        if get_books:
            zip_file = zip_files[zip_files.str.contains('books',
                                                        flags=2)].iat[0]
            with zfile.open(zip_file) as zdata:
                books = pd.read_csv(
                    zdata,
                    sep=delimiter,
                    header=0,
                    engine='c',
                    quoting=1,
                    escapechar='\\',
                    usecols=['ISBN', 'Book-Author', 'Publisher'])

    res = [
        data.rename(columns=lambda x: x.lower().replace('book-', '').replace(
            '-id', 'id')) for data in [ratings, users, books]
        if data is not None
    ]
    return res
def get_list(pathName=default_zip_file, input_filters_dict={}):
    '''
    Read the entire database of fonts to find out what unique entries are 
    available.
    
    Parameters
    ---------------
        pathName : the path of the zip file containing the database of characters
        input_filters_dict : a dictionary containing columns in the .csv file to
            be extracted.  keys = column heading, values = value to be 
            allowed in that column.  Returns an entire column if a key is not
            provided for it.
        
    Returns
    --------------
        a dataframe of all the all the unique lines in the dataset.
        
    Example:
    --------------    
    print(ocr_utils.get_list(columns=('font','fontVariant')))    

    '''

    # speed up list if only the font list is needed
    try:
        if (len(input_filters_dict) == 1) and (len(input_filters_dict['font'])
                                               == 0):
            with ZipFile(pathName, 'r') as myzip:
                y = sorted(myzip.namelist())
            for i, l in enumerate(y):
                y[i] = [l.replace('.csv', '')]
            return y
    except:
        pass

    df = read_file(pathName, input_filters_dict)
    df = df.loc[:, :'r0c0']
    keys = list(input_filters_dict.keys())
    df = df[keys]
    df = df.drop_duplicates()
    return df
Пример #14
0
def load_stops(trip_date, dpath, clean=True):
    # dpath argument is the path to the directory containing zips for each
    # agency or borough
    # clean=True eliminates duplicates that occur due to stops being shared by
    # multiple lines across GTFS files.  Method will return an error if this is set
    # to false but duplicates exist
    stops = pd.DataFrame()
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('stops.txt')
            times_df = pd.read_csv(StringIO(raw_text))
            stops = stops.append(times_df)
        except:
            print 'Error reading from ' + fi
    if clean == True:
        stops.drop_duplicates(subset='stop_id', inplace=True)
    else:
        pass
    stops.set_index('stop_id', drop=True, inplace=True, verify_integrity=True)
    return stops
Пример #15
0
def load_stop_times(trip_date, dpath):
    # dpath argument is the path to the directory containing zips for each
    # agency or borough
    # NOTE: returns times as string dtype (HH:MM:SS)
    stop_times = pd.DataFrame()
    agency_df = pd.DataFrame()
    for fi in effective_files(trip_date, dpath=dpath):
        try:
            with ZipFile(fi) as zf:
                raw_text = zf.read('stop_times.txt')
                agency = zf.read('agency.txt')
            stop_times = stop_times.append(pd.read_csv(StringIO(raw_text)))
            agency_df = agency_df.append(pd.read_csv(StringIO(agency)))
        except:
            print 'Error reading from ' + fi
    stop_times.set_index(['trip_id', 'stop_id'], drop=True, inplace=True)
    tz_string = agency_df['agency_timezone'].unique()
    if len(tz_string) == 1:
        tz_string = tz_string[0]
    else:
        tz_string = 'error: zero or multiple zones'
    return stop_times, tz_string
Пример #16
0
def build_metadata(dpath):
    for root, dirs, files in os.walk(dpath):
        mdata = pd.DataFrame(
            columns=['file', 'min_eff', 'max_disc', 'file_date'])
        for fname in files:
            try:
                with ZipFile(root + '/' + fname) as zf:
                    raw_text = zf.read('calendar.txt')
                    fdatetime = zf.getinfo('calendar.txt').date_time
                fdate = (str(fdatetime[0]) + '-' + str(fdatetime[1]).zfill(2) +
                         '-' + str(fdatetime[2]).zfill(2))
                csvdata = pd.read_csv(StringIO(raw_text))
                min_e = str(min(csvdata['start_date']))
                min_eff = min_e[:4] + '-' + min_e[4:6] + '-' + min_e[6:8]
                max_d = str(max(csvdata['end_date']))
                max_disc = max_d[:4] + '-' + max_d[4:6] + '-' + max_d[6:8]
                nrow = {'file':fname,'min_eff':min_eff,'max_disc':max_disc,\
                        'file_date':fdate}
                mdata = mdata.append(nrow, ignore_index=True)
            except:
                print 'Error reading from ' + fname
        mdata.to_csv(root + '/metadata.txt', index=False)
Пример #17
0
    def _get_one(self, name, *args, **kwargs):
        url = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/{name}.zip'\
            .format(name=name)

        response = self.session.get(url)
        raw = response.content  # returns bytes (.text returns unicode ; .content returns byte)
        if response.status_code != 200:
            raise IOError("Failed to get the data. Check that {0!r} is "
                          "a valid FamaFrench dataset.".format(name))

        with tempfile.TemporaryFile() as tmpf:
            tmpf.write(raw)

            with ZipFile(tmpf, 'r') as zf:
                data = zf.open(zf.namelist()[0]).readlines()

        line_lengths = np.array(lmap(len, data))
        file_edges = np.where(line_lengths == 2)[0]

        datasets = {}
        edges = zip(file_edges + 1, file_edges[1:])
        for i, (left_edge, right_edge) in enumerate(edges):
            dataset = [d.split() for d in data[left_edge:right_edge]]
            if len(dataset) > 10:
                ncol_raw = np.array(lmap(len, dataset))
                ncol = np.median(ncol_raw)
                header_index = np.where(ncol_raw == ncol - 1)[0][-1]
                header = dataset[header_index]
                ds_header = dataset[header_index + 1:]
                # to ensure the header is unique
                header = [
                    '{0} {1}'.format(j, hj)
                    for j, hj in enumerate(header, start=1)
                ]
                index = np.array([d[0] for d in ds_header], dtype=int)
                dataset = np.array([d[1:] for d in ds_header], dtype=float)
                datasets[i] = pd.DataFrame(dataset, index, columns=header)

        return datasets
Пример #18
0
def get_data_famafrench(name):
    # path of zip files
    zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
                    'ken.french/ftp/')
    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.read(name + '.txt').splitlines()

    line_lengths = np.array(map(len, data))
    file_edges = np.where(line_lengths)[0]

    datasets = {}
    edges = itertools.izip(file_edges[:-1], file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(map(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = [
                '{0} {1}'.format(*items)
                for items in enumerate(header, start=1)
            ]
            index = np.fromiter((d[0] for d in ds_header), dtype=int)
            dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Пример #19
0
def get_movielens_data(local_file=None,
                       get_ratings=True,
                       get_genres=False,
                       split_genres=True,
                       mdb_mapping=False,
                       get_tags=False,
                       include_time=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    fields = ['userid', 'movieid', 'rating']

    if include_time:
        fields.append('timestamp')

    if not local_file:
        # downloading data
        from requests import get
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        zip_response = get(zip_file_url)
        zip_contents = BytesIO(zip_response.content)
    else:
        zip_contents = local_file

    ml_data = ml_genres = ml_tags = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
        is_new_format = ('latest' in zip_file) or ('20m' in zip_file)
        delimiter = ','
        header = 0 if is_new_format else None
        if get_ratings:
            zdata = zfile.read(zip_file)
            zdata = zdata.replace(b'::', delimiter.encode())
            # makes data compatible with pandas c-engine
            # returns string objects instead of bytes in that case
            ml_data = pd.read_csv(BytesIO(zdata),
                                  sep=delimiter,
                                  header=header,
                                  engine='c',
                                  names=fields,
                                  usecols=fields)

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
            zdata = zfile.read(zip_file)
            if not is_new_format:
                # make data compatible with pandas c-engine
                # pandas returns string objects instead of bytes in that case
                delimiter = '^'
                zdata = zdata.replace(b'::', delimiter.encode())
            genres_data = pd.read_csv(BytesIO(zdata),
                                      sep=delimiter,
                                      header=header,
                                      engine='c',
                                      encoding='unicode_escape',
                                      names=['movieid', 'movienm', 'genres'])

            ml_genres = get_split_genres(
                genres_data) if split_genres else genres_data

        if get_tags:
            zip_file = zip_files[zip_files.str.contains('/tags')].iat[
                0]  #not genome
            zdata = zfile.read(zip_file)
            if not is_new_format:
                # make data compatible with pandas c-engine
                # pandas returns string objects instead of bytes in that case
                delimiter = '^'
                zdata = zdata.replace(b'::', delimiter.encode())
            fields[2] = 'tag'
            ml_tags = pd.read_csv(BytesIO(zdata),
                                  sep=delimiter,
                                  header=header,
                                  engine='c',
                                  encoding='latin1',
                                  names=fields,
                                  usecols=range(len(fields)))

        if mdb_mapping and is_new_format:
            # imdb and tmdb mapping - exists only in ml-latest or 20m datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata,
                                      sep=',',
                                      header=0,
                                      engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [
        data for data in [ml_data, ml_genres, ml_tags, mapping]
        if data is not None
    ]
    if len(res) == 1: res = res[0]
    return res
def get_data(geo, resolution, session):
    geo = conv_geo(geo)
    resolution = conv_resol(resolution)
    response = download_data(geo, resolution, session)
    logger.info("Request done")

    logger.info("Create stream file")
    zip_data = compat.BytesIO(response.content)

    logger.info("Creating a DataFrame per symbol")

    d = {}
    cols = None
    with ZipFile(zip_data, 'r') as zf:
        filelist = zf.filelist
        df_info = pd.DataFrame(filelist)
        df_info['filename'] = df_info[0].map(lambda x: x.filename)
        df_info['file_size'] = df_info[0].map(lambda x: x.file_size)
        df_info['date_time'] = df_info[0].map(
            lambda x: datetime.datetime(*x.date_time))
        del df_info[0]
        for zinfo in filelist:
            filename = zinfo.filename
            filename_short, filename_ext = os.path.splitext(filename)
            with zf.open(filename) as zfile:
                if filename_ext.lower() == '.txt':
                    file_exchange = filename.split('/')[3]
                    file_symbol = os.path.split(filename_short)[-1].upper()
                    logger.info(
                        "Building DataFrame for '%s' at '%s' from '%s' (%.1f)"
                        % (file_symbol, file_exchange, filename,
                           float(zinfo.file_size) / 1024))
                    if zinfo.file_size > 0:
                        try:
                            if resolution == 'd':
                                df = pd.read_csv(zfile, parse_dates=0)
                            else:
                                df = pd.read_csv(zfile, parse_dates=[[0, 1]])
                                df = df.rename(columns={'Date_Time': 'Date'})
                            df = df.set_index('Date')
                            df['Exchange'] = file_exchange
                            d[file_symbol] = df
                            if cols is None:
                                cols = df.columns
                        except KeyboardInterrupt:
                            logger.error("CTRL+C was pressed - exit")
                            break
                        except Exception as e:
                            logger.error(
                                "Can't build DataFrame for '%s' at '%s' from '%s'"
                                % (file_symbol, file_exchange,
                                   filename.replace(' ', '\ ')))
                            logger.error(traceback.format_exc())
                            d[file_symbol] = None
                            df['Exchange'] = file_exchange
                    else:
                        logger.error(
                            "Can't build DataFrame for '%s' at '%s' from '%s' (empty file)"
                            % (file_symbol, file_exchange,
                               filename.replace(' ', '\ ')))
                        d[file_symbol] = None
                        df['Exchange'] = file_exchange
    logger.info("Create Panel from DataFrame")
    panel = pd.Panel(d)
    panel = panel.transpose(2, 1, 0)
    panel.major_axis = panel.major_axis.map(lambda n: pd.to_datetime(str(n)))
    return (panel, df_info)
def read_file(pathName, input_filters_dict, random_state=None):
    '''
    Reads the .csv file containing the labeled data images.
    
    Parameters
    ------------
        pathName :path name of the zip file containing all the training data
        
        input_filters_dict: ['font', font_name] 
            a font_name is a string or tuple containing a list of the fonts
                to be read from the database or
                empty to return all fonts
            or a string containing a single font name
            or None which will return all fonts.
            
        random_state: None for random seed chosen by the system
            or integer seed for the random seed for repeatable calls
            
    Returns   
    ------------
    a pandas shuffled Dataframe containing the columns from the csv file 
    
    Note: The file to be read is a .zip file that in turn contains .csv
        files. Each .csv file contains images for a given font.
        This make access to a font, such as OCRA, fast because only one
        .csv file needs to be accessed.
     '''

    if os.path.exists(pathName) == False:
        print('{} does not exist!  Downloading it from the web'.format(
            default_zip_file),
              flush=True)
        downloadFile('http://lyman.house/download/{}'.format(default_zip_file))
        #downloadFile('http://lyman.house/download/fonts_chinese.zip')

    try:
        rd_font = input_filters_dict['font']
        if isinstance(rd_font, str):
            rd_font = (rd_font, )
    except:
        rd_font = ()

    with ZipFile(pathName, 'r') as myzip:
        if len(rd_font) == 0:
            names = myzip.namelist()
            print('\nreading all files...please wait')
            df = pd.concat(
                apply_column_filters(pd.read_csv(myzip.open(fname, 'r')),
                                     input_filters_dict) for fname in names)
        else:
            try:
                df = pd.concat(
                    apply_column_filters(
                        pd.read_csv(myzip.open(font + ".csv", 'r')),
                        input_filters_dict) for font in rd_font)
            except:
                raise ValueError(
                    'Could not find font file {} in the zip file'.format(
                        rd_font))
        myzip.close()
    assert df.size > 0

    return df.sample(frac=1, random_state=random_state)
Пример #22
0
def get_movielens_data(local_file=None,
                       get_ratings=True,
                       get_genres=False,
                       split_genres=True,
                       db_mapping=False):
    '''Downloads movielens data and stores it in pandas dataframe.
    '''
    if not local_file:
        # downloading data
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        zip_response = get(zip_file_url)
        zip_contents = StringIO(zip_response.content)
    else:
        zip_contents = local_file

    ml_data = ml_genres = mapping = None
    # loading data into memory
    with ZipFile(zip_contents) as zfile:
        zip_files = pd.Series(zfile.namelist())
        zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
        is_latest = 'latest' in zip_file
        header = 0 if is_latest else None
        if get_ratings:
            zdata = zfile.read(zip_file)
            delimiter = ','
            zdata = zdata.replace(
                '::', delimiter)  # makes data compatible with pandas c-engine
            ml_data = pd.read_csv(
                StringIO(zdata),
                sep=delimiter,
                header=header,
                engine='c',
                names=['userid', 'movieid', 'rating', 'timestamp'],
                usecols=['userid', 'movieid', 'rating'])

        if get_genres:
            zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
            with zfile.open(zip_file) as zdata:
                delimiter = ',' if is_latest else '::'
                genres_data = pd.read_csv(
                    zdata,
                    sep=delimiter,
                    header=header,
                    engine='python',
                    names=['movieid', 'movienm', 'genres'])

            ml_genres = get_split_genres(
                genres_data) if split_genres else genres_data

        if is_latest and db_mapping:
            # imdb and tmdb mapping - exists only in ml-latest datasets
            zip_file = zip_files[zip_files.str.contains('links')].iat[0]
            with zfile.open(zip_file) as zdata:
                mapping = pd.read_csv(zdata,
                                      sep=',',
                                      header=0,
                                      engine='c',
                                      names=['movieid', 'imdbid', 'tmdbid'])

    res = [data for data in [ml_data, ml_genres, mapping] if data is not None]
    if len(res) == 1: res = res[0]
    return res