예제 #1
0
#!/usr/bin/env python
import os, sys
import re
from itertools import izip_longest
import random
import csv
import logging
import numpy as np
import data_utils
from data_utils import get_file, to_one_hot, syslogger

# Raise the csv module's per-field size limit to sys.maxsize so that very
# large fields are not rejected by the parser.
# NOTE(review): on platforms where a C long is narrower than sys.maxsize this
# call may raise OverflowError -- confirm the target platforms.
csv.field_size_limit(sys.maxsize)

# Module-level logger obtained from data_utils.syslogger, passing this
# module's name.
logger = data_utils.syslogger(__name__)

class BadRecordException(Exception):
    """Custom exception type with no behavior beyond Exception.

    NOTE(review): no raise site is visible in this section; presumably it
    signals a malformed or unusable input record -- confirm against callers.
    """
class TextTooShortException(Exception):
    """Raised by enforce_length when the text is shorter than min_length."""

def enforce_length(txt, min_length=None, max_length=None, pad_out=False):
    if min_length is not None:
        if len(txt) < min_length:
            raise TextTooShortException()
    if max_length is not None:
        if len(txt) > max_length:
            # truncate txt (from end)
            return txt[0:max_length]
    if pad_out is True:
예제 #2
0
import random
import csv
import logging
import numpy as np
import data_utils
from data_utils import tokenize, tokenize_hanzi
from zipfile import ZipFile
from data_utils import get_file, to_one_hot, syslogger

# Module-level flag, off by default. Its consumers are not visible in this
# section -- presumably it toggles downloading every available CSV; TODO confirm.
download_all_csvs = False

# Raise the csv module's per-field size limit to sys.maxsize so that very
# large fields are not rejected by the parser.
# NOTE(review): on platforms where a C long is narrower than sys.maxsize this
# call may raise OverflowError -- confirm the target platforms.
csv.field_size_limit(sys.maxsize)

# Module-level logger obtained from data_utils.syslogger, passing this
# module's name.
logger = data_utils.syslogger(__name__)


class BadRecordException(Exception):
    """Custom exception type with no behavior beyond Exception.

    NOTE(review): no raise site is visible in this section; presumably it
    signals a malformed or unusable input record -- confirm against callers.
    """


class TextTooShortException(Exception):
    """Raised by enforce_length when the text is shorter than min_length."""


def enforce_length(txt, min_length=None, max_length=None, pad_out=False):
    if min_length is not None:
        if len(txt) < min_length:
            raise TextTooShortException()
    if max_length is not None:
예제 #3
0
import os
import gzip
import json
import logging
import h5py
import shutil
from data_utils import get_file, syslogger
# Module-level logger obtained from data_utils.syslogger, passing this
# module's name.
logger = syslogger(__name__)


class BoringException(Exception):
    """Custom exception type with no behavior beyond Exception.

    NOTE(review): no raise site is visible in this section, so the condition
    it signals cannot be determined from here -- confirm against callers.
    """


class AmazonReviews:


    def __init__(self, file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
                        amazon_url =   "http://snap.stanford.edu/data/amazon/"
                                       "productGraph/categoryFiles/"
                                       "reviews_Home_and_Kitchen.json.gz"):

        # download the data if necessary
        self.file_path = file_path
        data_root = self.download_data(file_path, amazon_url)

        # initialize the number of samples
        self.samples = 0