示例#1
0
def read_train_data(p=yelp_2013_train, validate_ratio=0.2, preprocess=True, int_label=True):
    """Read a JSON-lines review file and split it into train/validation sets.

    Every line of the file at ``p`` is parsed with ``json.loads`` and must
    provide a ``'text'`` and a ``'stars'`` entry. Reviews are grouped by
    their star value; for each group the leading ``validate_ratio`` share
    (rounded down) goes to the validation set and the remainder to the
    training set, so the split is done per label.

    Args:
        p: Path of the file to read.
        validate_ratio: Fraction of each star group used for validation.
        preprocess: If true, pass every text through ``preprocess_review``
            and drop reviews whose result has length zero.
        int_label: If true, convert the star value with ``int``.

    Returns:
        A tuple ``(train_x, train_y, validate_x, validate_y)`` of lists,
        where the ``*_y`` lists hold the star label of the matching text.
    """
    print("reading raw training data...")
    documents = {}
    # Kept as a float so the ratio computed below is a true division even
    # under Python 2 integer semantics.
    count = 0.0
    # The context manager closes the file even if a line fails to parse.
    with open(p) as f:
        for line in f:
            # Counts every line read, including reviews dropped below.
            count += 1
            json_object = json.loads(line)
            text = json_object['text']
            if preprocess:
                text = preprocess_review(text)
                if len(text) == 0:
                    continue
            star = json_object['stars']
            if int_label:
                star = int(star)
            documents.setdefault(star, []).append(text)
    sample_size = validate_ratio * count
    train_x = []
    train_y = []
    validate_x = []
    validate_y = []
    for star in documents:
        d = documents[star]
        # count cancels out: this is len(d) * validate_ratio, rounded down.
        star_sample_size = int(len(d) / count * sample_size)
        train_x.extend(d[star_sample_size:])
        validate_x.extend(d[:star_sample_size])
        train_y.extend([star] * (len(d) - star_sample_size))
        validate_y.extend([star] * star_sample_size)
    return train_x, train_y, validate_x, validate_y
示例#2
0
def read_data(p):
    """Read one review per line from ``p`` and label them by file name.

    The label is 1 when the path ends with ``'pos'`` and 0 otherwise; every
    line of the file receives that same label.

    Args:
        p: Path of the file to read.

    Returns:
        A tuple ``(x, y)`` where ``x`` holds the result of
        ``preprocess_review(..., filters=SIMPLE_FILTERS)`` for each line and
        ``y`` is a list of the file's label with the same length.
    """
    label = 1 if p.endswith('pos') else 0
    # The context manager guarantees the handle is released, including when
    # preprocessing raises part-way through the file.
    with open(p) as f:
        x = [
            # Bytes that cannot be decoded are dropped rather than raising.
            preprocess_review(unicode(line, errors='ignore'), filters=SIMPLE_FILTERS)
            for line in f
        ]
    y = [label] * len(x)
    return x, y
示例#3
0
def grab_data(path):
    """Preprocess the first line of every ``*.txt`` file found in ``path``.

    The function temporarily changes the process working directory to
    ``path`` so the glob pattern is resolved there, and always switches back
    to the previous directory before returning or propagating an error.

    Args:
        path: Directory to scan for ``*.txt`` files.

    Returns:
        A list with one ``preprocess_review`` result per matching file,
        built from that file's first line with surrounding whitespace
        stripped.
    """
    sentences = []
    currdir = os.getcwd()
    os.chdir(path)
    try:
        for ff in glob.glob("*.txt"):
            with open(ff, 'r') as f:
                sentences.append(preprocess_review(f.readline().strip()))
    finally:
        # Restore the caller's working directory even when reading or
        # preprocessing fails; otherwise later relative paths would break.
        os.chdir(currdir)
    return sentences
示例#4
0
def read_test_data(p=yelp_2013_test, preprocess=True, int_label=True):
    """Read a JSON-lines review file into parallel text and label lists.

    Every line of the file at ``p`` is parsed with ``json.loads`` and must
    provide a ``'text'`` and a ``'stars'`` entry. Unlike the training
    reader, no review is dropped here, even if preprocessing yields an
    empty result.

    Args:
        p: Path of the file to read.
        preprocess: If true, pass every text through ``preprocess_review``.
        int_label: If true, convert the star value with ``int``.

    Returns:
        A tuple ``(test_x, test_y)`` of equally long lists.
    """
    print("reading raw testing data...")
    test_x = []
    test_y = []
    # The context manager closes the file even if a line fails to parse.
    with open(p) as f:
        for line in f:
            json_object = json.loads(line)
            text = json_object['text']
            if preprocess:
                text = preprocess_review(text)
            star = json_object['stars']
            if int_label:
                star = int(star)
            test_x.append(text)
            test_y.append(star)
    return test_x, test_y
示例#5
0
def read_tweet_raw(cutoff=10):
    """Read the tweet training file at ``train_path`` into texts and labels.

    Each line is split on the three-character separator ``","``. The label
    is the character at index 1 of the first field converted with ``int``,
    and the review is the last field with one trailing newline removed.
    NOTE(review): this presumably expects rows shaped like
    ``"<label>","...","<text>"`` -- confirm against the data file.

    Lines whose preprocessing raises ``UnicodeDecodeError`` are skipped, as
    are reviews whose preprocessed result has fewer than ``cutoff`` items.

    Args:
        cutoff: Minimum length of the ``preprocess_review`` result for a
            review to be kept.

    Returns:
        A tuple ``(train_x, train_y)`` where ``train_x`` holds the
        preprocessed items of each kept review joined by single spaces and
        ``train_y`` holds the matching integer labels.
    """
    train_x = []
    train_y = []
    # The context manager closes the file on every exit path, including
    # errors raised while parsing a malformed line.
    with open(train_path) as f:
        for line in f:
            data = line.split('","')
            label = int(data[0][1])
            review = data[-1]
            # endswith is safe on an empty last field, unlike review[-1].
            if review.endswith('\n'):
                review = review[:-1]
            try:
                all_words = preprocess_review(review)
            except UnicodeDecodeError:
                continue
            if len(all_words) < cutoff:  # filter out reviews that are too short
                continue
            train_x.append(" ".join(all_words))
            train_y.append(label)

    return train_x, train_y