#!/usr/bin/env python import os, sys import re from itertools import izip_longest import random import csv import logging import numpy as np import data_utils from data_utils import get_file, to_one_hot, syslogger # update field size csv.field_size_limit(sys.maxsize) # setup logging logger = data_utils.syslogger(__name__) class BadRecordException(Exception): pass class TextTooShortException(Exception): pass def enforce_length(txt, min_length=None, max_length=None, pad_out=False): if min_length is not None: if len(txt) < min_length: raise TextTooShortException() if max_length is not None: if len(txt) > max_length: # truncate txt (from end) return txt[0:max_length] if pad_out is True:
# NOTE(review): this chunk arrived with its newlines stripped; it appears to
# start mid-import-block (sys is used below but `import sys` is not visible)
# and is cut off mid-function at the end. Code tokens are preserved exactly;
# only formatting and comments were added.
import random
import csv
import logging
import numpy as np
import data_utils
from data_utils import tokenize, tokenize_hanzi
from zipfile import ZipFile
from data_utils import get_file, to_one_hot, syslogger

# module-level switch; presumably gates bulk CSV downloading elsewhere in the
# file -- no use visible in this chunk.
download_all_csvs = False

# update field size -- raise the csv module's per-field limit so rows with
# very large text fields do not raise an error on read
csv.field_size_limit(sys.maxsize)

# setup logging using the project-wide logger factory from data_utils
logger = data_utils.syslogger(__name__)


class BadRecordException(Exception):
    # Marker exception for unusable records (no raise site visible in
    # this chunk).
    pass


class TextTooShortException(Exception):
    # Raised by enforce_length() when txt is shorter than min_length.
    pass


def enforce_length(txt, min_length=None, max_length=None, pad_out=False):
    """Constrain txt to the range [min_length, max_length].

    Raises TextTooShortException when txt is shorter than min_length.
    NOTE(review): the max_length handling is cut off at the end of this
    chunk; the rest of the function is not visible here.
    """
    if min_length is not None:
        if len(txt) < min_length:
            raise TextTooShortException()
    if max_length is not None:
# NOTE(review): this chunk arrived with its newlines stripped and the
# AmazonReviews class is cut off at the end (download_data is called but not
# visible). Code tokens are preserved exactly; only formatting and comments
# were added.
import os
import gzip
import json
import logging
import h5py
import shutil
from data_utils import get_file, syslogger

# module-level logger from the project-wide factory
logger = syslogger(__name__)


class BoringException(Exception):
    # Marker exception; no raise site visible in this chunk.
    pass


class AmazonReviews:
    # Loader for the SNAP Amazon product-review dataset
    # (Home & Kitchen category, gzipped JSON).

    def __init__(self, file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
                 amazon_url = "http://snap.stanford.edu/data/amazon/"
                              "productGraph/categoryFiles/"
                              "reviews_Home_and_Kitchen.json.gz"):
        """Set up the dataset, fetching the source file if needed.

        file_path  -- local path where the gzipped JSON reviews live
        amazon_url -- source URL (three adjacent string literals joined by
                      implicit concatenation into one URL)
        """
        # download the data if necessary (download_data is defined elsewhere
        # in this class, beyond the visible chunk)
        self.file_path = file_path
        # NOTE(review): data_root is not used in the visible span -- presumably
        # consumed further down in __init__; confirm against the full file.
        data_root = self.download_data(file_path, amazon_url)
        # initialize the number of samples seen/loaded so far
        self.samples = 0