def test_baseline(src, tar, C=1.0):
    """Train a TF-IDF + logistic-regression sentiment classifier on the
    source domain and score it on the target domain (no adaptation).

    :param src: source-domain name (subdirectory of ../dataset/Sentiment/)
    :param tar: target-domain name
    :param C: inverse regularization strength for LogisticRegression.
              Previously read from an outer-scope variable; now an explicit
              keyword parameter (sklearn's default value).
    :return: accuracy of the source-trained model on target pos+neg reviews
    """
    prefix = '../dataset/Sentiment/'
    suffix = ['/positive.review', '/negative.review', '/unlabeled.review']
    src_pos, src_pos_label = data_helper.read_file(prefix + src + suffix[0])
    src_neg, src_neg_label = data_helper.read_file(prefix + src + suffix[1])
    tar_pos, tar_pos_label = data_helper.read_file(prefix + tar + suffix[0])
    tar_neg, tar_neg_label = data_helper.read_file(prefix + tar + suffix[1])

    # Fit the vocabulary and IDF weights on source data only.
    transformer = TfidfTransformer()
    cv_src = CountVectorizer(min_df=20)
    x_src = transformer.fit_transform(
        cv_src.fit_transform(src_pos + src_neg)).toarray()
    y_src = src_pos_label + src_neg_label

    lr = LogisticRegression(solver='lbfgs', C=C)
    lr.fit(x_src, y_src)

    # BUG FIX: the original called transformer.fit_transform here, which
    # re-fitted the IDF statistics on the *target* counts, so source and
    # target vectors were weighted inconsistently. Use the source-fitted
    # transformer's transform() instead.
    x_tar = transformer.transform(
        cv_src.transform(tar_pos + tar_neg)).toarray()
    y_tar = tar_pos_label + tar_neg_label
    acc = lr.score(x_tar, y_tar)
    return acc
def _readData(self, filePath):
    """Read a review file and clean each review's text.

    :param filePath: path to the review file
    :return: tuple (labels, cleaned review strings)
    """
    labels, raw_reviews = data_helper.read_file(filePath)
    contents = [self.cleanReview(review) for review in raw_reviews]
    return labels, contents
def features_stub():
    """Build word features for the restaurant training data (stub).

    Reads the review file, splits reviews by sentiment, and builds the
    feature/category tuples; the final write step is deliberately left
    unfinished and the function raises before reaching it.
    """
    datafile = "restaurant-training.data"
    raw_data = data_helper.read_file(os.path.join(DATA_DIR, datafile))
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    category_texts = {"positive": positive_texts, "negative": negative_texts}
    feature_set = "word_features"

    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feature_set)

    # BUG FIX: `raise NotImplemented` raises a TypeError in Python 3,
    # because NotImplemented is a comparison sentinel, not an exception.
    raise NotImplementedError
    filename = "???"
    write_features_category(features_category_tuples, filename)
def features_stub():
    """Build word/POS/opinion features from the IMDB training data and
    write them to the corresponding testing-features file."""
    # Other available sets: word_pos_features, word_features,
    # word_pos_liwc_features, word_pos_opinion_features.
    feature_set = "word_pos_opinion_features"

    raw_data = data_helper.read_file("imdb-training.data")
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    features_category_tuples, texts = get_features_category_tuples(
        {"positive": positive_texts, "negative": negative_texts},
        feature_set)

    write_features_category(
        features_category_tuples, feature_set + "-testing-features.txt")
def build_features(data_file, feat_name, save_feats=None, binning=False):
    """Build feature/category tuples from a review data file.

    :param data_file: path to the raw review data
    :param feat_name: name of the feature set to extract
    :param save_feats: optional output path; when given, features are
                       written to this file
    :param binning: unused here; accepted for caller compatibility
    :return: tuple (features_category_tuples, texts)
    """
    # Load the raw reviews and split them by sentiment category.
    positive_texts, negative_texts = data_helper.get_reviews(
        data_helper.read_file(data_file))
    categories = {"positive": positive_texts, "negative": negative_texts}

    features_category_tuples, texts = get_features_category_tuples(
        categories, feat_name)

    # Persist only when an output path was requested.
    if save_feats is not None:
        write_features_category(features_category_tuples, save_feats)

    return features_category_tuples, texts
def test(src, tar, pivot_num, pivot_min_times, dim, C):
    """Evaluate a pivot-based domain-adaptation classifier (SCL-style).

    Loads a precomputed SVD projection W and pivot index list, augments the
    source bag-of-words features with projected non-pivot features, trains
    logistic regression on the source domain, and scores on the target.

    :param src: source-domain name
    :param tar: target-domain name
    :param pivot_num: number of pivots (used only to locate the pivot file)
    :param pivot_min_times: pivot frequency threshold (file name component)
    :param dim: SVD dimensionality (file name component)
    :param C: inverse regularization strength for LogisticRegression
    :return: accuracy on the target domain's pos+neg reviews
    """
    # Load the learned projection matrix and the pivot feature indices.
    weight_path = './weight/' + src + '2' + tar + '_svd_' + str(dim) + '.npy'
    W = np.load(weight_path)
    pivot_path = './pivot/' + src + '2' + tar + '_pivot_' + str(
        pivot_num) + '_' + str(pivot_min_times)
    with open(pivot_path, 'rb') as f:
        pivot = pickle.load(f)

    prefix = '../dataset/Sentiment/'
    suffix = ['/positive.review', '/negative.review', '/unlabeled.review']
    src_pos, src_pos_label = data_helper.read_file(prefix + src + suffix[0])
    src_neg, src_neg_label = data_helper.read_file(prefix + src + suffix[1])
    src_unl, src_unl_label = data_helper.read_file(prefix + src + suffix[2])
    tar_pos, tar_pos_label = data_helper.read_file(prefix + tar + suffix[0])
    tar_neg, tar_neg_label = data_helper.read_file(prefix + tar + suffix[1])
    tar_unl, tar_unl_label = data_helper.read_file(prefix + tar + suffix[2])

    # Base bag-of-words features fitted on labeled source reviews only.
    # transformer_src = TfidfTransformer()
    cv_src = CountVectorizer(min_df=20)
    x_src = cv_src.fit_transform(src_pos + src_neg).toarray()
    # x_src = transformer_src.fit_transform(cv_src.fit_transform(src_pos + src_neg)).toarray()

    # Shared vocabulary over labeled + unlabeled data from both domains;
    # the projection W was trained in this feature space.
    cv_lab_unl = CountVectorizer(min_df=40)
    # NOTE(review): x_lab_unl is computed but never used below (only the
    # commented-out TF-IDF variant consumed it) — candidate for removal.
    x_lab_unl = cv_lab_unl.fit_transform(src_unl + src_pos + src_neg + tar_unl).toarray()
    # transformer_lab_unl = TfidfTransformer()
    # x_lab_unl = transformer_lab_unl.fit_transform(cv_lab_unl.fit_transform(src_unl + src_pos + src_neg + tar_unl)).toarray()

    # Project source features: drop the pivot columns, then map the
    # remaining (non-pivot) features through W, and append to x_src.
    # x_src_transform = transformer_lab_unl.transform(cv_lab_unl.transform(src_pos + src_neg)).toarray()
    x_src_transform = cv_lab_unl.transform(src_pos + src_neg).toarray()
    x_src_transform = np.delete(x_src_transform, pivot, 1)
    x_src_transform = x_src_transform.dot(W)
    x_src = np.concatenate((x_src, x_src_transform), axis=1)
    y_src = src_pos_label + src_neg_label

    lr = LogisticRegression(solver='lbfgs', C=C)
    lr.fit(x_src, y_src)

    # Same augmentation for target reviews, using the source-fitted
    # vectorizers (transform only, no re-fitting).
    # x_tar_transform = transformer_lab_unl.transform(cv_lab_unl.transform(tar_pos + tar_neg)).toarray()
    x_tar_transform = cv_lab_unl.transform(tar_pos + tar_neg).toarray()
    x_tar_transform = np.delete(x_tar_transform, pivot, 1)
    x_tar_transform = x_tar_transform.dot(W)
    # x_tar = transformer_src.transform(cv_src.transform(tar_pos + tar_neg)).toarray()
    x_tar = cv_src.transform(tar_pos + tar_neg).toarray()
    x_tar = np.concatenate((x_tar, x_tar_transform), axis=1)
    y_tar = tar_pos_label + tar_neg_label

    acc = lr.score(x_tar, y_tar)
    return acc
def features_stub():
    """Generate every feature set for each IMDB data split and write the
    results under best_features/.

    For each of the training/development/testing splits, the raw data is
    read once and all four feature sets are extracted from it.
    """
    feature_sets = [
        "word_features", "word_pos_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]
    datasets = ["training", "development", "testing"]

    for dataset in datasets:
        # PERF: reading and parsing the data file is invariant across the
        # feature-set loop; the original re-read it on every iteration.
        datafile = "data/imdb-" + dataset + ".data"
        raw_data = data_helper.read_file(datafile)
        positive_texts, negative_texts = data_helper.get_reviews(raw_data)
        category_texts = {
            "positive": positive_texts,
            "negative": negative_texts
        }
        for feature_set in feature_sets:
            features_category_tuples, texts = get_features_category_tuples(
                category_texts, feature_set)
            filename = ("best_features/" + feature_set + "-" + dataset +
                        "-features.txt")
            write_features_category(features_category_tuples, filename)
def features_stub():
    """Generate every feature set for each IMDB data file, writing each
    result to '<feature_set>-<split>-features.txt'.

    The split name (e.g. '-training') is recovered from the data file
    name with a regex.
    """
    datafiles = [
        "imdb-training.data", "imdb-testing.data", "imdb-development.data"
    ]
    featuresets = [
        "word_pos_features", "word_features", "word_pos_liwc_features",
        "word_pos_opinion_features"
    ]

    for datafile in datafiles:
        # PERF: file reading, review parsing, and the split-name regex are
        # invariant across the feature-set loop; the original repeated them
        # on every iteration.
        raw_data = data_helper.read_file(datafile)
        positive_texts, negative_texts = data_helper.get_reviews(raw_data)
        category_texts = {
            "positive": positive_texts,
            "negative": negative_texts
        }
        # e.g. '-training' from 'imdb-training.data'
        data_set = re.search(r'-[a-z]+', datafile).group()
        for feature_set in featuresets:
            features_category_tuples, texts = get_features_category_tuples(
                category_texts, feature_set)
            filename = f'{feature_set}{data_set}-features.txt'
            write_features_category(features_category_tuples, filename)
def features_stub(filename):
    """Tokenize reviews from *filename*, build word/POS n-gram and LIWC
    features per sentiment category, and write the feature vectors out.

    :param filename: path of the review data file; also passed through to
        fwrite_feature_vectors as the output identifier
    """
    # open restaurant-training.data
    # calls data_helper.py to put file in pos or neg category list
    # here is where I would call other files as well
    datafile = filename
    raw_data = data_helper.read_file(datafile)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)
    # category_texts creates
    # { posive, [... all positive reviews ] , negative, [...all neg ...] }
    #
    #category_texts = {"positive": positive_texts, "negative": negative_texts}
    #feature_set = "word_features"

    # Accumulated word and POS tokens, flattened across all documents of
    # each category (document boundaries are not preserved).
    positive_toks = []
    positive_pos_toks = []
    negative_toks = []
    negative_pos_toks = []
    print('begin tokenize')

    # get word and pos tokens not the most
    # efficient but easier to trace
    for documents in positive_texts:
        positive_toks += get_words(documents)
    for documents in negative_texts:
        negative_toks += get_words(documents)
    for documents in positive_texts:
        positive_pos_toks += get_pos(documents)
    for documents in negative_texts:
        negative_pos_toks += get_pos(documents)
    print('tokenizing compl')

    # get ngrams for positive and negative categories.
    # Each n-gram pass runs once over the whole flattened token list of a
    # category (per-document variants are left commented out below).
    posi_word_ngram = {}
    posi_pos_ngram = {}
    neg_word_ngram = {}
    neg_pos_ngram = {}
    print('begin word ngram')
    #for tokens in positive_toks:
    #    posi_word_ngram.update( get_ngram_features( tokens ) )
    posi_word_ngram.update(get_ngram_features(positive_toks))
    print('all positive word ngram completed')
    print('begin negative word ngram')
    #for tokens in negative_toks:
    #    neg_word_ngram.update( get_ngram_features( tokens ) )
    neg_word_ngram.update(get_ngram_features(negative_toks))
    print('all negative word ngram completed')
    print('end word ngram')

    print('begin pos ngram')
    #for tokens in positive_toks:
    #    posi_pos_ngram.update( get_ngram_features( tokens ) )
    posi_pos_ngram.update(get_ngram_features(positive_pos_toks))
    print('all pos pos ngram completed')
    print('begin negative ngram')
    #for tokens in negative_toks:
    #    neg_pos_ngram.update( get_ngram_features( tokens ) )
    neg_pos_ngram.update(get_ngram_features(negative_pos_toks))
    print('all negative pos ngram completed')
    print('end pos ngram')

    print('begin liwc')
    # get LIWC features (computed from the word tokens, not POS tokens)
    posi_liwc_feat = get_liwc_features(positive_toks)
    neg_liwc_feat = get_liwc_features(negative_toks)
    print('end liwc')

    print('begin file write')
    print(posi_liwc_feat)
    print(neg_liwc_feat)
    fwrite_feature_vectors(filename, posi_word_ngram, neg_word_ngram,
                           posi_pos_ngram, neg_pos_ngram, posi_liwc_feat,
                           neg_liwc_feat)