def read_train_data(p=yelp_2013_train, validate_ratio=0.2, preprocess=True, int_label=True):
    """Read a Yelp JSON-lines training file and split it into train/validate sets.

    Each line of the file is a JSON object with at least 'text' and 'stars'
    keys.  Documents are grouped by star rating so the validation split is
    stratified: roughly ``validate_ratio`` of each rating's documents go to
    the validation set, the rest to training.

    Args:
        p: path to the JSON-lines file (defaults to ``yelp_2013_train``,
           defined elsewhere in this module).
        validate_ratio: fraction of documents reserved for validation.
        preprocess: when True, run each review through ``preprocess_review``
           and drop reviews that become empty.
        int_label: when True, coerce the star rating to ``int``.

    Returns:
        (train_x, train_y, validate_x, validate_y) — parallel lists of
        document texts and their star labels.
    """
    print("reading raw training data...")
    documents = {}  # star rating -> list of review texts
    count = 0.0  # float so the per-star ratio below uses true division on py2
    # 'with' guarantees the file is closed; plain iteration replaces the
    # deprecated, Python-2-only f.xreadlines().
    with open(p) as f:
        for line in f:
            count += 1
            json_object = json.loads(line)
            text = json_object['text']
            if preprocess:
                text = preprocess_review(text)
            # Skip reviews with no content (e.g. emptied by preprocessing).
            if len(text) == 0:
                continue
            star = json_object['stars']
            if int_label:
                star = int(star)
            documents.setdefault(star, []).append(text)

    # Total number of validation documents; distributed over ratings below
    # in proportion to each rating's share of the corpus.
    sample_size = validate_ratio * count
    train_x = []
    train_y = []
    validate_x = []
    validate_y = []
    for star in documents:
        d = documents[star]
        star_sample_size = int(len(d) / count * sample_size)
        # First star_sample_size documents validate, the remainder train.
        train_x.extend(d[star_sample_size:])
        validate_x.extend(d[:star_sample_size])
        train_y.extend([star] * (len(d) - star_sample_size))
        validate_y.extend([star] * star_sample_size)
    return train_x, train_y, validate_x, validate_y
def read_data(p):
    """Read one review per line from *p* and label the whole file by its path.

    The label is derived from the path suffix: a path ending in ``'pos'``
    yields label 1 for every line, anything else yields 0.

    Args:
        p: path to a text file with one review per line.

    Returns:
        (x, y): the preprocessed review texts and a matching list of labels.
    """
    label = 1 if p[-3:] == 'pos' else 0
    # 'with' ensures the handle is closed even if preprocessing raises
    # (the original leaked the file object).
    with open(p) as f:
        # NOTE: unicode() with errors='ignore' silently drops undecodable
        # bytes; py2-only, consistent with the rest of this module.
        x = [preprocess_review(unicode(line, errors='ignore'), filters=SIMPLE_FILTERS)
             for line in f]
    y = [label] * len(x)
    return x, y
def grab_data(path):
    """Collect the preprocessed first line of every ``*.txt`` file in *path*.

    Args:
        path: directory to scan (non-recursively) for ``.txt`` files.

    Returns:
        A list with one preprocessed sentence per file, in glob order.
    """
    # Glob on the joined path instead of chdir-ing into the directory:
    # the original os.chdir/os.chdir dance left the process CWD mutated
    # if anything raised mid-loop.
    sentences = []
    for file_path in glob.glob(os.path.join(path, "*.txt")):
        with open(file_path, 'r') as f:
            # Only the first line of each file is used.
            sentences.append(preprocess_review(f.readline().strip()))
    return sentences
def read_test_data(p=yelp_2013_test, preprocess=True, int_label=True):
    """Read a Yelp JSON-lines test file into parallel text/label lists.

    Each line is a JSON object with 'text' and 'stars' keys.  Unlike
    ``read_train_data``, empty texts are NOT filtered out, so the output
    stays aligned with the input file line-for-line.

    Args:
        p: path to the JSON-lines file (defaults to ``yelp_2013_test``,
           defined elsewhere in this module).
        preprocess: when True, run each review through ``preprocess_review``.
        int_label: when True, coerce the star rating to ``int``.

    Returns:
        (test_x, test_y): review texts and their star labels.
    """
    print("reading raw testing data...")
    test_x = []
    test_y = []
    # 'with' guarantees the file is closed (the original leaked it); plain
    # iteration replaces the deprecated, Python-2-only f.xreadlines().
    with open(p) as f:
        for line in f:
            json_object = json.loads(line)
            text = json_object['text']
            if preprocess:
                text = preprocess_review(text)
            star = json_object['stars']
            if int_label:
                star = int(star)
            test_x.append(text)
            test_y.append(star)
    return test_x, test_y
def read_tweet_raw(cutoff=10):
    """Read the tweet training CSV at ``train_path`` into text/label lists.

    Lines are split on the '","' field delimiter; the sentiment digit is the
    second character of the first (quoted) field and the tweet text is the
    last field.  Tweets that fail Unicode decoding or have fewer than
    ``cutoff`` tokens after preprocessing are dropped.

    Args:
        cutoff: minimum number of preprocessed tokens a tweet must have.

    Returns:
        (train_x, train_y): space-joined token strings and integer labels.
    """
    train_x = []
    train_y = []
    # 'with' closes the file even on an unexpected exception (the original
    # f.close() was skipped if anything above it raised).
    with open(train_path) as f:
        for line in f:
            data = line.split('","')
            # data[0] is like '"4' -> the digit at index 1 is the label.
            label = int(data[0][1])
            review = data[-1]
            # endswith() is safe on an empty field, unlike review[-1]
            # which raised IndexError.
            if review.endswith('\n'):
                review = review[:-1]
            try:
                all_words = preprocess_review(review)
            except UnicodeDecodeError:
                # Best-effort: skip undecodable tweets rather than abort.
                continue
            if len(all_words) < cutoff:  # filter out too-short tweets
                continue
            train_x.append(" ".join(all_words))
            train_y.append(label)
    return train_x, train_y