Example #1
    def collect_bow(self, array, ngram_types_array, posborder, negborder, nr):
        """ Collect Bag of words of array with specified array and ngrams
		Returns negative and positive bag of words
		"""

        bowObject = BagOfWords(array, self.tweet_class)
        negbow = {}
        posbow = {}

        # Create positive and negative bag of words
        for item in ngram_types_array:
            bowObject.create_corpus(item)
            posbow.update(bowObject.bow_partial(max_border=0 + posborder, min_border=-1, nr=nr))
            negbow.update(bowObject.bow_partial(max_border=1, min_border=0 + negborder, nr=nr))

        return (negbow, posbow)
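
A hedged call sketch for collect_bow; the argument values and n-gram labels below are purely illustrative, since the surrounding class and the BagOfWords interface (create_corpus, bow_partial) are not shown here.

# Illustrative only: collect bags of words for two n-gram types, with
# example border thresholds and an example value for nr.
negbow, posbow = self.collect_bow(tokens, ['unigrams', 'bigrams'],
                                  posborder=0.1, negborder=-0.1, nr=1000)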
Example #2
File: run.py  Project: xmli/cme193
def test_bow():
	print 'BagOfWords Check:\n----------------------'
	try:
		d = ['this is the final cme193 assignment', 'i hope you learn some skills you can apply elsewhere']
		bow = BagOfWords(top_n=8)
		bow.fit(d)
		X = bow.transform(d)
		ref = [[ 0.,  1.,  0.,  1.,  1.,  1.,  0.,  0.],
			   [ 2.,  0.,  1.,  0.,  0.,  0.,  1.,  1.]]

		
		if not np.allclose(X, ref):
			print '[FAILED], incorrect representation' 
			return False
		else:
			print '[PASSED]' 
			return True
	except Exception:
		print '[FAILED], error in calculation' 
		return False
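
Since the BagOfWords class under test is not shown, here is a minimal sketch (not the assignment's reference solution) of a class with the top_n/fit/transform interface the test exercises. Tie-breaking among equally frequent words determines the column order, so this sketch is not guaranteed to reproduce the exact reference matrix.

from collections import Counter
import numpy as np

class BagOfWords:
    def __init__(self, top_n=10):
        self.top_n = top_n
        self.vocab = []

    def fit(self, docs):
        # Keep the top_n most frequent whitespace-separated tokens in the corpus.
        counts = Counter(tok for doc in docs for tok in doc.split())
        self.vocab = [w for w, _ in counts.most_common(self.top_n)]

    def transform(self, docs):
        # One row per document, one column per vocabulary word, raw counts.
        X = np.zeros((len(docs), len(self.vocab)))
        for i, doc in enumerate(docs):
            doc_counts = Counter(doc.split())
            for j, w in enumerate(self.vocab):
                X[i, j] = doc_counts[w]
        return X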
Example #3
def get_trained_model(engagement_key='share_count',
                      cutoff=100,
                      is_logistic=False):
    with gzip.open("../data/lapresse.json.gz", "rb") as f:
        data = json.loads(f.read().decode('utf-8'))

    def _map(x):
        if is_logistic:
            if x > 1 and x < 5:
                return 0
            if x > 45:
                return 1
        else:
            if x > 0 and x < cutoff:
                return x

    b = BagOfWords(_classifier if is_logistic else _regressor)
    b.train({
        d['title']: _map(d['engagement'][engagement_key])
        for d in data if _map(d['engagement'][engagement_key]) is not None
    })
    return b
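
For reference, a tiny standalone illustration of the filtering performed by _map and the dict comprehension above (is_logistic=True case); the titles and engagement numbers are made up.

# Values strictly between 1 and 5 map to class 0, values above 45 map to
# class 1, and anything else maps to None and is dropped from the training dict.
def _map(x):
    if 1 < x < 5:
        return 0
    if x > 45:
        return 1
    return None

samples = {'quiet post': 3, 'viral post': 120, 'middling post': 20}
training = {title: _map(count) for title, count in samples.items()
            if _map(count) is not None}
print(training)   # {'quiet post': 0, 'viral post': 1}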
Example #4
def set_pipeline():
    return Pipeline([('bag-of-words', BagOfWords()),
                     ('vectoring', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
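
A usage sketch for the pipeline above, with a hypothetical stand-in BagOfWords transformer so the snippet is self-contained; the real class is assumed to behave like a scikit-learn transformer that maps each document to a {token: count} dict, which DictVectorizer then turns into a feature matrix for BernoulliNB.

from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

class BagOfWords(BaseEstimator, TransformerMixin):
    # Hypothetical stand-in: tokenize on whitespace and count tokens.
    def fit(self, docs, y=None):
        return self

    def transform(self, docs):
        return [Counter(doc.split()) for doc in docs]

pipeline = Pipeline([('bag-of-words', BagOfWords()),
                     ('vectoring', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
docs = ["free prize inside", "meeting at noon", "win a free prize now"]
pipeline.fit(docs, [1, 0, 1])          # e.g. 1 = spam, 0 = not spam
print(pipeline.predict(["free prize today"]))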
Example #5
def train_a_model(sourcefolder, extension, include_punctuation, maxfeatures, outputfolder):

	if not os.path.exists(outputfolder):
		os.makedirs(outputfolder)

	if not sourcefolder.endswith('/'):
		sourcefolder = sourcefolder + '/'
	if not outputfolder.endswith('/'):
		outputfolder = outputfolder + '/'
	# This just makes things easier.

	# Get a list of files.
	allthefiles = os.listdir(sourcefolder)

	# Now we have a list of file names. But we want volumeIDs, paired with complete
	# paths to the file. We're going to achieve the pairing by zipping two lists,
	# rather than with a dict, because ordering also matters here.

	volumeIDs = list()
	volumepaths = list()

	for filename in allthefiles:

		if filename.endswith(extension):
			volID = filename.replace(extension, "")
			# The volume ID is basically the filename minus its extension.
			# Extensions are likely to be long enough that there is little
			# danger of accidental occurrence inside a filename. E.g.
			# '.fic.tsv'
			path = sourcefolder + filename
			volumeIDs.append(volID)
			volumepaths.append(path)

	# Now we actually read volumes and create a training corpus, which will
	# be a list of bags of words.

	trainingset = list()
	for volID, filepath in zip(volumeIDs, volumepaths):
		volume = BagOfWords(filepath, volID, include_punctuation)
		# That reads the volume from disk.
		trainingset.append(volume)

	# We select the most common words as features.
	featurelist = select_common_features(trainingset, maxfeatures)
	numfeatures = len(featurelist)
	# Note that the number of features we actually got is not necessarily
	# the same as maxfeatures.

	for volume in trainingset:
		volume.selectfeatures(featurelist)
		volume.normalizefrequencies()
		# The volume now contains feature frequencies:
		# raw counts have been divided by the total number of words in the volume.

	standardizer = StandardizingVector(trainingset, featurelist)
	# This object calculates the means and standard deviations of all features
	# across the training set.

	listofvolumefeatures = list()
	for volume in trainingset:
		volume.standardizefrequencies(standardizer)
		# We have now converted frequencies to z scores. This is important for
		# regularized logistic regression -- otherwise the regularization
		# gets distributed unevenly across variables because they're scaled
		# differently.

		listofvolumefeatures.append(volume.features)

	# Now let's make a data frame by concatenating each volume as a separate column,
	# aligned on the features that index rows.

	data = pd.concat(listofvolumefeatures, axis = 1)
	data.columns = volumeIDs

	# Name the columns for volumes. Then transpose the matrix:

	data = data.T

	# So that we have a matrix with features (variables) as columns and instances (volumes)
	# as rows. Would have been easier to make this directly, but I don't know a neat
	# way to do it in pandas.

	classvector = epistolarymetadata.get_genrevector(volumeIDs, "nonepistolary / epistolary")
	# This part is going to be very specific to the model you train, so I've
	# encapsulated it in a separate module. For our purposes, it's just a function
	# that returns a pandas series of zeroes and ones indexed by volumeID.
	# zero = non, one = epistolary.

	logisticmodel = LogisticRegression(C = 1)
	classvector = classvector.astype('int')
	logisticmodel.fit(data, classvector)

	# Let's sort the features by their coefficient in the model, and print.

	coefficients = list(zip(logisticmodel.coef_[0], featurelist))
	coefficients.sort()
	for coefficient, word in coefficients:
		print(word + " :  " + str(coefficient))

	# Pickle and write the model & standardizer. This will allow us to apply the model to
	# new documents of unknown genre.

	modelfile = outputfolder + "logisticmodel.p"
	with open(modelfile, mode = 'wb') as f:
		pickle.dump(logisticmodel, f)
	standardizerfile = outputfolder + "standardizer.p"
	with open(standardizerfile, mode = 'wb') as f:
		pickle.dump(standardizer, f)

	accuracy_tries = cross_validation.cross_val_score(logisticmodel, data, classvector, cv=5)
	print(accuracy_tries)
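
select_common_features is referenced but not shown in this example. A hedged sketch of what it might do, assuming each BagOfWords volume exposes a raw token-count dictionary; the attribute name rawcounts is hypothetical, not taken from the actual class.

from collections import Counter

def select_common_features(trainingset, maxfeatures):
    # Sum raw counts across all volumes and keep the most frequent words.
    totals = Counter()
    for volume in trainingset:
        totals.update(volume.rawcounts)   # hypothetical attribute name
    return [word for word, count in totals.most_common(maxfeatures)]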
Example #6
def train_a_model(sourcefolder, extension, include_punctuation, maxfeatures,
                  outputfolder, classpath):

    if not os.path.exists(outputfolder):
        os.makedirs(outputfolder)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    if not outputfolder.endswith('/'):
        outputfolder = outputfolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    random.shuffle(allthefiles)

    # Now we have a list of file names. But we want volumeIDs, paired with complete
    # paths to the file. We're going to achieve the pairing by zipping two lists,
    # rather than with a dict, because ordering also matters here.

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    # Get the class vector, indexed by volume ID

    classvector = get_classvector(classpath, volumeIDs)
    assert len(classvector) == len(volumeIDs)

    # Now we actually read volumes and create a training corpus, which will
    # be a list of bags of words.

    trainingset = list()
    for volID, filepath in zip(volumeIDs, volumepaths):
        volume = BagOfWords(filepath, volID, include_punctuation)
        # That reads the volume from disk.
        trainingset.append(volume)

    # We select the most common words as features.
    featurelist = select_common_features(trainingset, maxfeatures)
    numfeatures = len(featurelist)
    # Note that the number of features we actually got is not necessarily
    # the same as maxfeatures.

    for volume in trainingset:
        volume.selectfeatures(featurelist)
        volume.normalizefrequencies()
        # The volume now contains feature frequencies:
        # raw counts have been divided by the total number of words in the volume.

    standardizer = StandardizingVector(trainingset, featurelist)
    # This object calculates the means and standard deviations of all features
    # across the training set.

    listofvolumefeatures = list()
    for volume in trainingset:
        volume.standardizefrequencies(standardizer)
        # We have now converted frequencies to z scores. This is important for
        # regularized logistic regression -- otherwise the regularization
        # gets distributed unevenly across variables because they're scaled
        # differently.

        listofvolumefeatures.append(volume.features)

    # Now let's make a data frame by concatenating each volume as a separate column,
    # aligned on the features that index rows.

    data = pd.concat(listofvolumefeatures, axis=1)
    data.columns = volumeIDs

    # Name the columns for volumes. Then transpose the matrix:

    data = data.T

    # So that we have a matrix with features (variables) as columns and instances (volumes)
    # as rows. Would have been easier to make this directly, but I don't know a neat
    # way to do it in pandas.

    logisticmodel = LogisticRegression(C=0.1)
    classvector = classvector.astype('int')
    logisticmodel.fit(data, classvector)

    # Let's sort the features by their coefficient in the model, and print.

    coefficients = list(zip(logisticmodel.coef_[0], featurelist))
    coefficients.sort()
    for coefficient, word in coefficients:
        print(word + " :  " + str(coefficient))

    # Pickle and write the model & standardizer. This will allow us to apply the model to
    # new documents of unknown genre.

    modelfile = outputfolder + "logisticmodel.p"
    with open(modelfile, mode='wb') as f:
        pickle.dump(logisticmodel, f)
    standardizerfile = outputfolder + "standardizer.p"
    with open(standardizerfile, mode='wb') as f:
        pickle.dump(standardizer, f)

    accuracy_tries = cross_validation.cross_val_score(logisticmodel,
                                                      data,
                                                      classvector,
                                                      cv=10)
    print(accuracy_tries)
    print(np.sum(accuracy_tries) / len(accuracy_tries))

    # Shuffle the class vector and rescore as a permutation baseline; if the
    # model learned real signal, accuracy should now drop to roughly chance.
    random.shuffle(classvector)
    print('\nASSVECTOR!\n')
    accuracy_tries = cross_validation.cross_val_score(logisticmodel,
                                                      data,
                                                      classvector,
                                                      cv=10)
    print(accuracy_tries)
    print(np.sum(accuracy_tries) / len(accuracy_tries))
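
Note that sklearn.cross_validation was removed in later scikit-learn releases; on current versions the same scoring step would go through sklearn.model_selection, roughly as follows.

# Equivalent cross-validation scoring on current scikit-learn versions.
from sklearn.model_selection import cross_val_score

accuracy_tries = cross_val_score(logisticmodel, data, classvector, cv=10)
print(accuracy_tries)
print(accuracy_tries.mean())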
Example #7
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))
    """
    Word 2 vec
    """

    word2vec = Word2VecProvider()

    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)

    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")
    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output,
                               index=False,
                               float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output,
                                   index=False,
                                   float_format="%.6f")
        with open('sequences', 'wb') as fp:
            pickle.dump(padded_sequences, fp)
        with open('sentiments', 'wb') as fp:
            pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
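
A small illustration of the Tokenizer / pad_sequences step used above, with toy texts in place of the real summaries; the import paths assume the standalone keras package the example appears to rely on.

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_texts = ["bitcoin up today", "bitcoin down"]
tok = Tokenizer(num_words=20000)
tok.fit_on_texts(toy_texts)
seqs = tok.texts_to_sequences(toy_texts)       # e.g. [[1, 2, 3], [1, 4]]
padded = pad_sequences(seqs, maxlen=200)       # zero-padded on the left by default
print(padded.shape)                            # (2, 200)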
Example #8
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None,
               sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path,
                        is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv(
            "data/one_month_clean_test_data_with_prices.csv",
            sep=',',
            encoding='utf-8',
            index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    if os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))
        """
        Word 2 vec
        """

        word2vec = Word2VecProvider()

        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")
        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model

        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                          axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt",
                      "r",
                      encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            # f.close()
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)

            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)

    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                         axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    # train_data_word2vec_file_name and test_data_word2vec_file_name are not
    # defined in this function; they are assumed to be module-level globals.
    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output,
                                   index=False,
                                   float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output,
                                       index=False,
                                       float_format="%.6f")
    return data.data_model, word2vec_data_model
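
A hedged sketch of how the pickled embeddings_index and train_word_indexes produced above might later be combined into an embedding matrix for a downstream model; the 200-dimension constant matches the GloVe file used in the example, everything else is an assumption.

import pickle
import numpy as np

EMBEDDING_DIM = 200   # matches glove.twitter.27B.200d

with open('data/embeddings_index', 'rb') as fp:
    embeddings_index = pickle.load(fp)
with open('data/train_word_indexes', 'rb') as fp:
    word_index = pickle.load(fp)

# Row i holds the GloVe vector for the token with index i; rows for
# out-of-vocabulary tokens stay at zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector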
Example #9
File: run.py  Project: xmli/cme193
	print 'Starting final assignment testing...'

	TEST(test_bow())
	TEST(test_regression_theta())
	TEST(test_regression_pred())

	yhat = False
	
	with open('labels.txt') as f:
		y = [int(l.strip()) for l in f.readlines()]

	with open('example_text.txt') as f:
		texts = [l.strip().lower() for l in f.readlines()]

	
	bow = BagOfWords(top_n=1500)

	
	bow.fit(texts)

	X = bow.transform(texts)

	#-CME193-START-------------------

	# !!!!!! PLEASE READ !!!!!!
	# You will need to create a new instance of LinearProbabilityModel, call your
	# .fit on X and y, and put your predictions (after calling .predict) in a
	# variable called yhat.

	#-CME193-END---------------------
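
A hedged sketch of one possible completion of the section above, assuming LinearProbabilityModel takes no constructor arguments and exposes the fit/predict interface described in the comment.

# One possible completion (not the official solution): fit the linear
# probability model on the bag-of-words features and store predictions in yhat.
lpm = LinearProbabilityModel()
lpm.fit(X, y)
yhat = lpm.predict(X)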