def get_data(data_file):
    """Load review text for *data_file* located under DATA_DIR.

    Returns a (positive_texts, negative_texts) pair for .tsv files;
    otherwise returns whatever single structure data_helper.get_reviews
    yields for the file.
    """
    full_path = os.path.join(DATA_DIR, data_file)
    if "tsv" not in data_file:
        # Non-tsv files come back as one category->texts structure.
        return data_helper.get_reviews(full_path)
    pos_texts, neg_texts = data_helper.get_reviews(full_path)
    return pos_texts, neg_texts
def build_features(data_file, feat_name, binning):
    """Read reviews from ./<data_file> and build (features, category) tuples.

    :param data_file: file name relative to the current directory
    :param feat_name: name of the feature set to extract
    :param binning: binning flag forwarded to the feature builder
    :return: (features_category_tuples, texts)
    """
    # Note this variant reads from "./", not DATA_DIR.
    texts_by_category = data_helper.get_reviews(os.path.join("./", data_file))
    feature_tuples, texts = get_features_category_tuples(
        texts_by_category, feat_name, binning)
    return feature_tuples, texts
def features_stub(path=os.path.join(DATA_DIR, "train_examples.tsv"), feature_set="word_features"):
    """Build feature/category tuples for the labeled reviews at *path*.

    :param path: tsv file with labeled reviews (defaults to the train split)
    :param feature_set: feature-set name passed to the feature builder
    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(path)
    labeled = {"positive": pos_texts, "negative": neg_texts}
    feature_tuples, texts = get_features_category_tuples(labeled, feature_set)
    return feature_tuples, texts
def build_features(data_file, feat_name):
    """Load labeled reviews from DATA_DIR and build feature/category tuples.

    :param data_file: file name relative to DATA_DIR
    :param feat_name: feature-set name
    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))
    labeled = {"positive": pos_texts, "negative": neg_texts}
    feature_tuples, texts = get_features_category_tuples(labeled, feat_name)
    return feature_tuples, texts
def build_features(data_file, feat_name, save_feats=None):
    """Build feature/category tuples, handling the unlabeled test file.

    "test.txt" has no gold labels, so all of its reviews go under one
    "test data" key; every other file splits into positive/negative.

    :param save_feats: if given, also write the features to this file name
    :return: (features_category_tuples, texts)
    """
    full_path = os.path.join(DATA_DIR, data_file)
    if data_file == "test.txt":
        category_texts = {"test data": data_helper.get_reviews(full_path)}
    else:
        pos_texts, neg_texts = data_helper.get_reviews(full_path)
        category_texts = {"positive": pos_texts, "negative": neg_texts}

    feature_tuples, texts = get_features_category_tuples(category_texts, feat_name)

    if save_feats is not None:
        write_features_category(feature_tuples, save_feats)
    return feature_tuples, texts
def features_stub():
    """Build word features for the restaurant training data.

    Currently unfinished: the output filename has not been chosen, so this
    always raises NotImplementedError after building the features.

    :raises NotImplementedError: always, until a filename is chosen
    """
    datafile = "restaurant-training.data"
    raw_data = data_helper.read_file(os.path.join(DATA_DIR, datafile))
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)
    category_texts = {"positive": positive_texts, "negative": negative_texts}
    feature_set = "word_features"
    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feature_set)
    # FIX: the original did `raise NotImplemented`, but NotImplemented is a
    # sentinel value, not an exception — raising it is a TypeError in
    # Python 3. NotImplementedError is the intended placeholder.
    raise NotImplementedError("output filename not chosen yet")
    filename = "???"
    write_features_category(features_category_tuples, filename)
def features_stub():
    """Build word+POS+opinion features for IMDB training data and save them."""
    datafile = "imdb-training.data"
    raw_data = data_helper.read_file(datafile)
    pos_texts, neg_texts = data_helper.get_reviews(raw_data)
    labeled = {"positive": pos_texts, "negative": neg_texts}
    # Other available sets: word_pos_features, word_features,
    # word_pos_liwc_features, word_pos_opinion_features.
    feature_set = "word_pos_opinion_features"
    feature_tuples, _texts = get_features_category_tuples(labeled, feature_set)
    write_features_category(feature_tuples, feature_set + "-testing-features.txt")
def build_features(data_file, feat_name, save_feats=None, binning=False):
    """Build feature/category tuples for a labeled review file.

    NOTE: this variant forwards data_file (not the binning flag) to
    features.get_features_category_tuples; `binning` is accepted for
    interface compatibility but unused.

    :param save_feats: if given, also write the features to this file name
    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))
    labeled = {"positive": pos_texts, "negative": neg_texts}

    feature_tuples, texts = features.get_features_category_tuples(
        labeled, feat_name, data_file)

    if save_feats is not None:
        write_features_category(feature_tuples, save_feats)
    return feature_tuples, texts
def build_features(data_file, feat_name, save_feats=None, binning=False):
    """Read *data_file* directly (no DATA_DIR prefix), build features,
    and optionally persist them.

    :param save_feats: if given, also write the features to this file name
    :param binning: accepted for interface compatibility; unused here
    :return: (features_category_tuples, texts)
    """
    raw = data_helper.read_file(data_file)
    pos_texts, neg_texts = data_helper.get_reviews(raw)
    labeled = {"positive": pos_texts, "negative": neg_texts}

    feature_tuples, texts = get_features_category_tuples(labeled, feat_name)

    if save_feats is not None:
        write_features_category(feature_tuples, save_feats)
    return feature_tuples, texts
def main(reviews, output):
    """Train an SVM word-embedding model, sanity-check it on the dev split,
    then classify every review file found under the global `path` directory
    and write "<label> <text>" lines to *output*.

    :param reviews: unused here (kept for interface compatibility)
    :param output: path of the prediction file to write
    """
    model = build_classifier("svm")
    model = train_word_embem_model(model)

    # Evaluate on the dev split; the result is currently unused.
    dev_data = "dev_examples.tsv"
    dev_feats, dev_text = get_we_feat(dev_data)
    acc, cm = evaluate(model, dev_feats, dev_text)

    # NOTE(review): `path` is a module-level global here — confirm it is set
    # before this runs.
    # FIX: use a context manager so the output file is always closed (the
    # original leaked the handle), and os.path.join instead of `path + d`
    # string concatenation for subdirectory traversal.
    with open(output, "w+") as out_file:
        for d in os.listdir(path):
            for f in os.listdir(os.path.join(path, d)):
                texts = data_helper.get_reviews(os.path.join(path, d, f))
                for text in texts:
                    w2v_feat = features.get_word_embedding_features(text)
                    out_file.write(model.classify(w2v_feat) + " " + text + "\n")
def build_features(data_file, feat_name, binning, save_feats=None, test=False):
    """Build feature/category tuples; in test mode the gold labels are hidden.

    :param test: when True, categories are relabeled "unknown"/"unknown2"
    :param save_feats: if given, also write the features to this file name
    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))

    if test:
        # Anonymize categories so evaluation code cannot see the labels.
        labeled = {"unknown": pos_texts, "unknown2": neg_texts}
    else:
        labeled = {"positive": pos_texts, "negative": neg_texts}

    feature_tuples, texts = get_features_category_tuples(labeled, feat_name, binning)

    if save_feats is not None:
        write_features_category(feature_tuples, save_feats, test)
    return feature_tuples, texts
def predict(feat_set, eval_data, train_data, out):
    """Train a model on *train_data* and write one "<label> <text>" line per
    review in *eval_data* to the file *out*.

    :param feat_set: one of "word_features", "word_pos_features",
        "word_pos_liwc_features" (richer sets are supersets of the simpler ones)
    :param eval_data: review file under DATA_DIR to classify
    :param train_data: training data passed to train_model
    :param out: output file path
    """
    model = train_model(train_data, feat_set)
    texts = data_helper.get_reviews(os.path.join(DATA_DIR, eval_data))
    # FIX: context manager guarantees the prediction file is flushed and
    # closed (the original leaked the handle).
    with open(out, "w+") as fout:
        for text in texts:
            words, tags = features.get_words_tags(text)
            feature_vectors = {}
            # The branches in the original were cumulative copies; express
            # the superset relation directly instead of repeating calls.
            if feat_set in ("word_features", "word_pos_features",
                            "word_pos_liwc_features"):
                feature_vectors.update(features.get_ngram_features(words))
            if feat_set in ("word_pos_features", "word_pos_liwc_features"):
                feature_vectors.update(features.get_pos_features(tags))
            if feat_set == "word_pos_liwc_features":
                feature_vectors.update(features.get_liwc_features(words))
            fout.write(model.classify(feature_vectors) + " " + text + "\n")
    return
def build_features(data_file, feat_name):
    """Load labeled reviews from DATA_DIR and build feature/category tuples.

    :param data_file: file name relative to DATA_DIR
    :param feat_name: feature-set name
    :return: (features_category_tuples, texts)
    """
    pos_texts, neg_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))
    feature_tuples, texts = get_features_category_tuples(
        {"positive": pos_texts, "negative": neg_texts}, feat_name)
    return feature_tuples, texts
def build_features(data_file, feat_name):
    """Build feature/category tuples for *data_file* and persist them to
    "<feat_name>-<split>", where split is derived from the file name.

    :param data_file: file name relative to DATA_DIR
    :param feat_name: feature-set name
    :return: (features_category_tuples, texts)
    """
    positive_texts, negative_texts = data_helper.get_reviews(
        os.path.join(DATA_DIR, data_file))
    category_texts = {"positive": positive_texts, "negative": negative_texts}

    features_category_tuples, texts = get_features_category_tuples(
        category_texts, feat_name)

    # Map the raw file names to human-readable split names.
    datamap = {
        "dev_examples.tsv": "development",
        "train_examples.tsv": "training",
    }
    # FIX: the original indexed datamap directly and raised KeyError for any
    # file other than the two known splits (e.g. "test.txt"); fall back to
    # the bare file stem so unknown files still get a sensible output name.
    split = datamap.get(str(data_file), os.path.splitext(str(data_file))[0])
    write_features_category(features_category_tuples,
                            "{}-{}".format(feat_name, split))
    return features_category_tuples, texts
def train_eval(train_file, eval_file, review_file, feature_set, pred_file):
    """Train a model, evaluate it, and report results into *pred_file*.

    All console output produced inside is redirected (appended) to
    pred_file. For a test review_file, per-review predictions are printed;
    otherwise the review_file is evaluated like eval_file.

    :param train_file: training data for train_model
    :param eval_file: labeled data evaluated first
    :param review_file: test data ("test" in name) or a second eval file
    :param feature_set: feature-set name used throughout
    :param pred_file: file that captures everything printed here
    """
    # Train the model (falls back to a saved classifier if training is skipped).
    model = train_model(train_file, feature_set, "train")
    if model is None:
        # NOTE(review): classifier_fname is a module-level global — confirm.
        model = get_classifier(classifier_fname)

    saved_stdout = sys.stdout
    pred_out = open(pred_file, 'a')
    sys.stdout = pred_out
    try:
        print("Using " + feature_set)
        print(eval_file)
        features_data, texts = build_features(eval_file, feature_set)
        accuracy, cm = evaluate(model, features_data, texts,
                                data_set_name="eval-{}".format(feature_set))
        print("\nThe accuracy of {} is: {}".format(eval_file, accuracy))

        if "test" in review_file:
            # Unlabeled test data: print one "<label> <text>" line per review.
            texts = data_helper.get_reviews(os.path.join(DATA_DIR, review_file))
            for text in texts:
                words, tags = features.get_words_tags(text)
                feature_vectors = {}
                feature_vectors.update(features.get_ngram_features(words))
                print(model.classify(feature_vectors) + " " + text + "\n")
        else:
            features_data, texts = build_features(review_file, feature_set)
            accuracy, cm = evaluate(model, features_data, texts,
                                    data_set_name="eval-{}".format(feature_set))
            print(review_file)
            # NOTE(review): the original prints eval_file (not review_file)
            # in this message — behavior kept; confirm which was intended.
            print("\nThe accuracy of {} is: {}".format(eval_file, accuracy))
    finally:
        # FIX: the original never closed the redirect file and left stdout
        # redirected if evaluation raised; restore and close unconditionally.
        sys.stdout = saved_stdout
        pred_out.close()
def features_stub():
    """Write a feature file for every (dataset, feature set) pairing of the
    IMDB data into best_features/."""
    feature_sets = [
        "word_features",
        "word_pos_features",
        "word_pos_liwc_features",
        "word_pos_opinion_features",
    ]
    for dataset in ["training", "development", "testing"]:
        for feature_set in feature_sets:
            # Re-read per combination, exactly like the original loop.
            raw_data = data_helper.read_file("data/imdb-" + dataset + ".data")
            pos_texts, neg_texts = data_helper.get_reviews(raw_data)
            labeled = {"positive": pos_texts, "negative": neg_texts}
            feature_tuples, _texts = get_features_category_tuples(
                labeled, feature_set)
            out_name = ("best_features/" + feature_set + "-" + dataset +
                        "-features.txt")
            write_features_category(feature_tuples, out_name)
def features_stub():
    """Write "<feature_set><split>-features.txt" for each IMDB data file and
    feature set (split includes its leading dash, e.g. "-training")."""
    data_files = [
        "imdb-training.data",
        "imdb-testing.data",
        "imdb-development.data",
    ]
    feature_sets = [
        "word_pos_features",
        "word_features",
        "word_pos_liwc_features",
        "word_pos_opinion_features",
    ]
    for datafile in data_files:
        for feature_set in feature_sets:
            raw_data = data_helper.read_file(datafile)
            pos_texts, neg_texts = data_helper.get_reviews(raw_data)
            feature_tuples, _texts = get_features_category_tuples(
                {"positive": pos_texts, "negative": neg_texts}, feature_set)
            # "imdb-training.data" -> "-training"
            split = re.search(r'-[a-z]+', datafile).group()
            write_features_category(feature_tuples,
                                    f'{feature_set}{split}-features.txt')
def write_features():
    """Scaffolding that dumps feature keys for each (dataset, feature set).

    NOTE(review): only the positive reviews are processed, only "positive "
    markers are written to the file, and the liwc branch still discards its
    results — those quirks are kept as in the original.
    """
    feat_sets = [
        "word_features",
        "word_pos_features",
        "word_pos_liwc_features",
    ]
    data_sets = ["-training", "-development", "-testing"]
    end = "-features.txt"
    # Map each split suffix to its source file (replaces the if-chain).
    split_files = {
        "-training": "train_examples.tsv",
        "-development": "dev_examples.tsv",
        "-testing": "test.txt",
    }
    for data in data_sets:
        for feat in feat_sets:
            file = split_files[data]
            # FIX: context manager closes the handle; the original opened
            # one file per combination and never closed any of them.
            with open(feat + data + end, "w+") as fout:
                positive_texts, negative_texts = data_helper.get_reviews(
                    os.path.join(DATA_DIR, file))
                for text in positive_texts:
                    words, tags = features.get_words_tags(text)
                    fout.write("positive ")
                    if feat == "word_features":
                        # FIX: the original wrote `.keys` without calling it,
                        # which prints the bound method object, not the keys.
                        print(features.get_ngram_features(words).keys())
                    elif feat == "word_pos_features":
                        print(features.get_ngram_features(words).keys())
                        print(features.get_pos_features(tags).keys())
                    elif feat == "word_pos_liwc_features":
                        # Results intentionally unused in the original.
                        features.get_ngram_features(words)
                        features.get_pos_features(tags)
                        features.get_liwc_features(words)
    return
def build_w2vec(data_file):
    """Load reviews from ./<data_file> and return their word2vec features."""
    reviews = data_helper.get_reviews(os.path.join("./", data_file))
    return get_w2v(reviews)
def features_stub(filename):
    """Tokenize the reviews in *filename*, build word/POS ngram and LIWC
    features per sentiment category, and hand everything to
    fwrite_feature_vectors.

    :param filename: raw review file readable by data_helper.read_file
    """
    raw_data = data_helper.read_file(filename)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)

    print('begin tokenize')
    # Collect word tokens and POS tokens per category.  Not the most
    # efficient traversal, but easy to trace.
    positive_toks = [tok for doc in positive_texts for tok in get_words(doc)]
    negative_toks = [tok for doc in negative_texts for tok in get_words(doc)]
    positive_pos_toks = [tok for doc in positive_texts for tok in get_pos(doc)]
    negative_pos_toks = [tok for doc in negative_texts for tok in get_pos(doc)]
    print('tokenizing compl')

    # Word-level ngram features per category.
    print('begin word ngram')
    posi_word_ngram = dict(get_ngram_features(positive_toks))
    print('all positive word ngram completed')
    print('begin negative word ngram')
    neg_word_ngram = dict(get_ngram_features(negative_toks))
    print('all negative word ngram completed')
    print('end word ngram')

    # POS-level ngram features per category.
    print('begin pos ngram')
    posi_pos_ngram = dict(get_ngram_features(positive_pos_toks))
    print('all pos pos ngram completed')
    print('begin negative ngram')
    neg_pos_ngram = dict(get_ngram_features(negative_pos_toks))
    print('all negative pos ngram completed')
    print('end pos ngram')

    # LIWC features per category.
    print('begin liwc')
    posi_liwc_feat = get_liwc_features(positive_toks)
    neg_liwc_feat = get_liwc_features(negative_toks)
    print('end liwc')

    print('begin file write')
    print(posi_liwc_feat)
    print(neg_liwc_feat)
    fwrite_feature_vectors(filename, posi_word_ngram, neg_word_ngram,
                           posi_pos_ngram, neg_pos_ngram,
                           posi_liwc_feat, neg_liwc_feat)
# NOTE(review): the next line is the tail of a function whose "def" is not
# visible in this chunk (presumably the feature builder) — left unchanged.
    return features_category_tuples, all_texts


def write_features_category(features_category_tuples, outfile_name):
    """
    Save the feature values to file.

    Writes one "<category>\\t<features>" line per tuple, with the category
    left-justified to 10 characters.

    :param features_category_tuples: iterable of (features, category) pairs
    :param outfile_name: path of the file to (over)write, UTF-8 encoded
    :return: None
    """
    with open(outfile_name, "w", encoding="utf-8") as fout:
        for (features, category) in features_category_tuples:
            fout.write("{0:<10s}\t{1}\n".format(category, features))


if __name__ == "__main__":
    # Ad-hoc driver: prints the word features of each test review.
    # Earlier experiments left commented out below for reference.
    #file = open("./data/test.txt")
    #text = "this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong"
    #for text in file:
    #print(get_liwc_features(nltk.word_tokenize("this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong")))
    #words, tags = get_words_tags("this is not a love hate love hotdog. not a hotdog. i love sandwiches wing wong")
    #print(words)
    #print(get_word_pos_features(text))
    # NOTE(review): `out` is opened but never written to or closed, because
    # the stdout redirect below is commented out — confirm if still needed.
    out = open("restaurant-competition-model-P1-predictions.txt", "w")
    #sys.stdout = out
    for review in data_helper.get_reviews("./data/test.txt"):
        print(get_word_features(review))
    sys.stdout = sys.__stdout__
    pass