def collect_bow(self, array, ngram_types_array, posborder, negborder, nr):
    """
    Collect bags of words from `array` for the specified n-gram types.

    Returns a tuple of (negative, positive) bags of words.
    """
    bowObject = BagOfWords(array, self.tweet_class)
    negbow = {}
    posbow = {}

    # Create positive and negative bag of words
    for item in ngram_types_array:
        bowObject.create_corpus(item)
        posbow.update(bowObject.bow_partial(max_border=0 + posborder, min_border=-1, nr=nr))
        negbow.update(bowObject.bow_partial(max_border=1, min_border=0 + negborder, nr=nr))

    return (negbow, posbow)
def test_bow():
    print 'BagOfWords Check:\n----------------------'
    try:
        d = ['this is the final cme193 assignment',
             'i hope you learn some skills you can apply elsewhere']
        bow = BagOfWords(top_n=8)
        bow.fit(d)
        X = bow.transform(d)
        ref = [[0., 1., 0., 1., 1., 1., 0., 0.],
               [2., 0., 1., 0., 0., 0., 1., 1.]]
        if not np.allclose(X, ref):
            print '[FAILED], incorrect representation'
            return False
        else:
            print '[PASSED]'
            return True
    except Exception:
        print '[FAILED], error in calculation'
        return False
def get_trained_model(engagement_key='share_count', cutoff=100, is_logistic=False):
    with gzip.open("../data/lapresse.json.gz", "rb") as f:
        data = json.loads(f.read().decode('utf-8'))

    def _map(x):
        if is_logistic:
            if x > 1 and x < 5:
                return 0
            if x > 45:
                return 1
        else:
            if x > 0 and x < cutoff:
                return x

    b = BagOfWords(_classifier if is_logistic else _regressor)
    b.train({
        d['title']: _map(d['engagement'][engagement_key])
        for d in data
        if _map(d['engagement'][engagement_key]) is not None
    })
    return b
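# Usage sketch (not from the original source): assumes ../data/lapresse.json.gz exists
# and that the module-level _classifier / _regressor objects referenced above are defined.
# The keyword arguments simply mirror the defaults in get_trained_model's signature.

# Regression on raw share counts below the default cutoff of 100.
share_model = get_trained_model(engagement_key='share_count', cutoff=100)

# Binary classification: titles with 2-4 shares (label 0) vs. more than 45 shares (label 1).
viral_model = get_trained_model(engagement_key='share_count', is_logistic=True)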
def set_pipeline():
    return Pipeline([('bag-of-words', BagOfWords()),
                     ('vectoring', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
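# Usage sketch (not part of the original source): assumes the custom BagOfWords
# transformer above maps raw documents to token-count dicts, so that DictVectorizer
# can consume its output; the texts and labels below are placeholders.
train_texts = ["great movie, loved it", "terrible plot and dull acting"]
train_labels = [1, 0]

model = set_pipeline()
model.fit(train_texts, train_labels)        # bag-of-words -> dict vectorizer -> Bernoulli naive Bayes
print(model.predict(["loved the acting"]))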
def train_a_model(sourcefolder, extension, include_punctuation, maxfeatures, outputfolder):
    if not os.path.exists(outputfolder):
        os.makedirs(outputfolder)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    if not outputfolder.endswith('/'):
        outputfolder = outputfolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    # Now we have a list of file names. But we want volumeIDs, paired with complete
    # paths to the file. We're going to achieve the pairing by zipping two lists,
    # rather than with a dict, because ordering also matters here.

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    # Now we actually read volumes and create a training corpus, which will
    # be a list of bags of words.

    trainingset = list()
    for volID, filepath in zip(volumeIDs, volumepaths):
        volume = BagOfWords(filepath, volID, include_punctuation)
        # That reads the volume from disk.
        trainingset.append(volume)

    # We select the most common words as features.
    featurelist = select_common_features(trainingset, maxfeatures)
    numfeatures = len(featurelist)
    # Note that the number of features we actually got is not necessarily
    # the same as maxfeatures.

    for volume in trainingset:
        volume.selectfeatures(featurelist)
        volume.normalizefrequencies()
        # The volume now contains feature frequencies:
        # raw counts have been divided by the total number of words in the volume.

    standardizer = StandardizingVector(trainingset, featurelist)
    # This object calculates the means and standard deviations of all features
    # across the training set.

    listofvolumefeatures = list()
    for volume in trainingset:
        volume.standardizefrequencies(standardizer)
        # We have now converted frequencies to z scores. This is important for
        # regularized logistic regression -- otherwise the regularization
        # gets distributed unevenly across variables because they're scaled
        # differently.
        listofvolumefeatures.append(volume.features)

    # Now let's make a data frame by concatenating each volume as a separate column,
    # aligned on the features that index rows.
    data = pd.concat(listofvolumefeatures, axis=1)
    data.columns = volumeIDs
    # Name the columns for volumes. Then transpose the matrix:
    data = data.T
    # So that we have a matrix with features (variables) as columns and instances (volumes)
    # as rows. Would have been easier to make this directly, but I don't know a neat
    # way to do it in pandas.

    classvector = epistolarymetadata.get_genrevector(volumeIDs, "nonepistolary / epistolary")
    # This part is going to be very specific to the model you train, so I've
    # encapsulated it in a separate module. For our purposes, it's just a function
    # that returns a pandas series of zeroes and ones indexed by volumeID.
    # zero = non, one = epistolary.

    logisticmodel = LogisticRegression(C=1)
    classvector = classvector.astype('int')
    logisticmodel.fit(data, classvector)

    # Let's sort the features by their coefficient in the model, and print.
    coefficients = list(zip(logisticmodel.coef_[0], featurelist))
    coefficients.sort()
    for coefficient, word in coefficients:
        print(word + " : " + str(coefficient))

    # Pickle and write the model & standardizer. This will allow us to apply the model to
    # new documents of unknown genre.
    modelfile = outputfolder + "logisticmodel.p"
    with open(modelfile, mode='wb') as f:
        pickle.dump(logisticmodel, f)

    standardizerfile = outputfolder + "standardizer.p"
    with open(standardizerfile, mode='wb') as f:
        pickle.dump(standardizer, f)

    accuracy_tries = cross_validation.cross_val_score(logisticmodel, data, classvector, cv=5)
    print(accuracy_tries)
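# Usage sketch (hypothetical paths and parameter values): trains a model on a folder
# of '.fic.tsv' volumes, then reloads the pickled model and standardizer written above.
train_a_model("corpus/", ".fic.tsv", include_punctuation=False,
              maxfeatures=3200, outputfolder="modeloutput/")

with open("modeloutput/logisticmodel.p", mode='rb') as f:
    reloaded_model = pickle.load(f)
with open("modeloutput/standardizer.p", mode='rb') as f:
    reloaded_standardizer = pickle.load(f)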
def train_a_model(sourcefolder, extension, include_punctuation, maxfeatures, outputfolder, classpath):
    if not os.path.exists(outputfolder):
        os.makedirs(outputfolder)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    if not outputfolder.endswith('/'):
        outputfolder = outputfolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    random.shuffle(allthefiles)

    # Now we have a list of file names. But we want volumeIDs, paired with complete
    # paths to the file. We're going to achieve the pairing by zipping two lists,
    # rather than with a dict, because ordering also matters here.

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    # Get the class vector, indexed by volume ID
    classvector = get_classvector(classpath, volumeIDs)
    assert len(classvector) == len(volumeIDs)

    # Now we actually read volumes and create a training corpus, which will
    # be a list of bags of words.

    trainingset = list()
    for volID, filepath in zip(volumeIDs, volumepaths):
        volume = BagOfWords(filepath, volID, include_punctuation)
        # That reads the volume from disk.
        trainingset.append(volume)

    # We select the most common words as features.
    featurelist = select_common_features(trainingset, maxfeatures)
    numfeatures = len(featurelist)
    # Note that the number of features we actually got is not necessarily
    # the same as maxfeatures.

    for volume in trainingset:
        volume.selectfeatures(featurelist)
        volume.normalizefrequencies()
        # The volume now contains feature frequencies:
        # raw counts have been divided by the total number of words in the volume.

    standardizer = StandardizingVector(trainingset, featurelist)
    # This object calculates the means and standard deviations of all features
    # across the training set.

    listofvolumefeatures = list()
    for volume in trainingset:
        volume.standardizefrequencies(standardizer)
        # We have now converted frequencies to z scores. This is important for
        # regularized logistic regression -- otherwise the regularization
        # gets distributed unevenly across variables because they're scaled
        # differently.
        listofvolumefeatures.append(volume.features)

    # Now let's make a data frame by concatenating each volume as a separate column,
    # aligned on the features that index rows.
    data = pd.concat(listofvolumefeatures, axis=1)
    data.columns = volumeIDs
    # Name the columns for volumes. Then transpose the matrix:
    data = data.T
    # So that we have a matrix with features (variables) as columns and instances (volumes)
    # as rows. Would have been easier to make this directly, but I don't know a neat
    # way to do it in pandas.

    logisticmodel = LogisticRegression(C=0.1)
    classvector = classvector.astype('int')
    logisticmodel.fit(data, classvector)

    # Let's sort the features by their coefficient in the model, and print.
    coefficients = list(zip(logisticmodel.coef_[0], featurelist))
    coefficients.sort()
    for coefficient, word in coefficients:
        print(word + " : " + str(coefficient))

    # Pickle and write the model & standardizer. This will allow us to apply the model to
    # new documents of unknown genre.
    modelfile = outputfolder + "logisticmodel.p"
    with open(modelfile, mode='wb') as f:
        pickle.dump(logisticmodel, f)

    standardizerfile = outputfolder + "standardizer.p"
    with open(standardizerfile, mode='wb') as f:
        pickle.dump(standardizer, f)

    accuracy_tries = cross_validation.cross_val_score(logisticmodel, data, classvector, cv=10)
    print(accuracy_tries)
    print(np.sum(accuracy_tries) / len(accuracy_tries))

    # Sanity-check baseline: shuffle the class vector and re-run cross-validation;
    # accuracy should drop to roughly chance level.
    random.shuffle(classvector)
    print('\nASSVECTOR!\n')

    accuracy_tries = cross_validation.cross_val_score(logisticmodel, data, classvector, cv=10)
    print(accuracy_tries)
    print(np.sum(accuracy_tries) / len(accuracy_tries))
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv", sep=',',
                                   encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv", sep=',',
                                   encoding='utf-8', index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))

    """
    Word 2 vec
    """
    word2vec = Word2VecProvider()

    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)
    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()

    for text in data.processed_data["summary"]:
        texts.append(text)

    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)

    print("texts: ", texts[0:5])

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    with open('sequences', 'wb') as fp:
        pickle.dump(padded_sequences, fp)
    with open('sentiments', 'wb') as fp:
        pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
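# Usage sketch (hypothetical file paths): runs the full preprocessing pipeline on a
# training dump and caches both data models as CSV, as preprocess() does above when
# the cache arguments are provided.
bow_model, w2v_model = preprocess(
    "data/reddit_posts.csv",
    is_testing=False,
    min_occurrences=5,
    cache_bow_output="data/cache_bow.csv",
    cache_word2vec_output="data/cache_word2vec.csv",
)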
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None, sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()
    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/one_month_clean_test_data_with_prices.csv", sep=',',
                                   encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv", sep=',',
                                   encoding='utf-8', index=False)
        # os.remove(data_path)

    if os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        """
        Word 2 vec
        """
        word2vec = Word2VecProvider()

        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")

        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model
        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            # f.close()

            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)

            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)

    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)
            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    return data.data_model, word2vec_data_model
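# Hedged sketch (not in the original source): one common way the embeddings_index and
# train_word_indexes pickled in the training branch above could be combined downstream
# into an embedding matrix for a Keras Embedding layer. The 200-dimension assumption
# matches the 200d GloVe file loaded above.
import pickle
import numpy

with open('data/embeddings_index', 'rb') as fp:
    embeddings_index = pickle.load(fp)
with open('data/train_word_indexes', 'rb') as fp:
    word_index = pickle.load(fp)

embedding_dim = 200
embedding_matrix = numpy.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector  # words without a GloVe vector stay all-zero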
print 'Starting final assignment testing...'
TEST(test_bow())
TEST(test_regression_theta())
TEST(test_regression_pred())

yhat = False

with open('labels.txt') as f:
    y = [int(l.strip()) for l in f.readlines()]
with open('example_text.txt') as f:
    texts = [l.strip().lower() for l in f.readlines()]

bow = BagOfWords(top_n=1500)
bow.fit(texts)
X = bow.transform(texts)

#-CME193-START-------------------
# !!!!!! PLEASE READ !!!!!!
# you will need to create a new instance of LinearProbabilityModel, you will need
# to call your .fit on X and y, and you will need to put your predictions
# (after calling .predict) in a variable called yhat.
#-CME193-END---------------------
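# Sketch only, not the official assignment solution: assumes LinearProbabilityModel
# from the assignment scaffold exposes the fit/predict interface that the CME193
# placeholder comment above describes.
lpm = LinearProbabilityModel()
lpm.fit(X, y)
yhat = lpm.predict(X)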