Example #1
import pickle
from os.path import expanduser

def prepareForTest(dataset_path="~/tweetnet/data/text_data.pkl"):
    # Load the pickled tweet/hashtag data (createHtDict is defined elsewhere in the project).
    with open(expanduser(dataset_path), "rb") as f:
        text_data = pickle.load(f)
    (testTweets, testHashtags, testMw, testTweetSequence,
     testHashtagSequence, testMwSequence, testStartIdx) = text_data[:7]
    # Load the word2vec dictionary and build the hashtag dictionary from the test hashtags.
    with open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb") as f:
        dictionary = pickle.load(f)
    htDic = createHtDict(dictionary, testHashtags)
    return htDic, testTweets, testHashtags, testMw, testTweetSequence, testHashtagSequence, testMwSequence, testStartIdx
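# --- Usage sketch (illustration, not part of the original example) ---
# A minimal call of prepareForTest(), assuming the default pickle paths exist and
# createHtDict is importable from the project.
(htDic, testTweets, testHashtags, testMw,
 testTweetSequence, testHashtagSequence, testMwSequence, testStartIdx) = prepareForTest()
print("Loaded %d test tweets and %d hashtag dictionary entries" % (len(testTweets), len(htDic)))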
Example #2
import pickle
from os.path import expanduser
import numpy
from keras.models import Sequential
from keras.layers import LSTM

# testTweets/trainTweets and the sequence lists are assumed to be loaded earlier (cf. Example #1).
nTestData = len(testTweets)
nTrainData = len(trainTweets)
nTestSequences = len(testTweetSequence)
nTrainSequences = len(trainTweetSequence)
print("Number of testing sequences: ", nTestSequences)
print("Number of training sequences: ", nTrainSequences)
print("Number of testing tweets: ", nTestData)
print("Number of training tweets: ", nTrainData)

#for i in range(1000):
#    print (trainTweetSequence[i], trainHashtagSequence[i])
# Load word2vec dictionary
dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb"))

# Create the hashtag dictionary
htDic = createHtDict(dictionary, testHashtags)
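# --- Illustration only: a hypothetical sketch of what createHtDict might do. ---
# The real implementation lives elsewhere in tweetnet; here it is assumed to map each
# hashtag that has a word2vec entry to its 300-dimensional embedding vector.
def createHtDictSketch(dictionary, hashtags):
    htDic = {}
    for ht in hashtags:
        if ht in dictionary:  # skip hashtags without a word2vec vector
            htDic[ht] = dictionary[ht]
    return htDic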

numEpochs = 50

lamb = 0.0001
# Build the cLSTM model
#print("\n")
print("Start building model ....")
model = Sequential()

#model.add(LSTM(numHiddenFirst, return_sequences=True, input_shape=(sequenceLength, inputSize)))

# numHiddenFirst, sequenceLength and inputSize are assumed to be defined earlier in the script.
model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize)))

#model.add(BatchNormalization())
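# --- Illustration only (assumptions, not the original training code) ---
# A self-contained sketch of how a model like the one above could be finished, compiled
# and fitted; the dummy sizes, Dense head, mse loss and adam optimizer are assumptions,
# and lamb above is interpreted as an L2 weight-decay coefficient.
from keras.layers import Dense
from keras.regularizers import l2

_seqLen, _inSize, _hidden, _outSize = 20, 300, 256, 300
_sketch = Sequential()
_sketch.add(LSTM(_hidden, input_shape=(_seqLen, _inSize)))
_sketch.add(Dense(_outSize, kernel_regularizer=l2(lamb)))
_sketch.compile(loss="mse", optimizer="adam")
_sketch.fit(numpy.random.rand(32, _seqLen, _inSize),
            numpy.random.rand(32, _outSize),
            epochs=1, batch_size=8, verbose=0)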
dictionary = pickle.load(
    open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb"))

# Build the input/output embedding pairs: each line of `hashtags` is split on whitespace,
# and its second and third tokens are looked up in the 300-dimensional word2vec dictionary.
data = numpy.zeros([len(hashtags), 300])
label = numpy.zeros([len(hashtags), 300])
inputStringLabel = []
outputStringLabel = []
for i in range(len(hashtags)):
    line = hashtags[i]
    listHashtag = line.split()
    data[i, :] = dictionary[listHashtag[1]]      # input hashtag embedding
    label[i, :] = dictionary[listHashtag[2]]     # target hashtag embedding
    inputStringLabel.append(listHashtag[1])
    outputStringLabel.append(listHashtag[2])

htDic = createHtDict(dictionary, outputStringLabel)
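# --- Illustration only: a hypothetical helper for the topN evaluation configured below. ---
# Assuming htDic maps hashtag strings to their 300-d word2vec vectors, this returns the
# topN hashtags whose embeddings are closest (cosine similarity) to a predicted vector.
def topNHashtagsSketch(htDic, predictedVec, topN=10):
    scores = []
    for ht, vec in htDic.items():
        denom = numpy.linalg.norm(predictedVec) * numpy.linalg.norm(vec)
        sim = numpy.dot(predictedVec, vec) / denom if denom > 0 else 0.0
        scores.append((sim, ht))
    scores.sort(reverse=True)
    return [ht for _, ht in scores[:topN]]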

# Train and test split
trainPercent = 0.99
nTrainData = numpy.round(len(data) * trainPercent).astype(int)
topN = 10
nEpoch = 5000
logAllPredictions = True
trainData = data[0:nTrainData]
# Start the test slice at nTrainData (not nTrainData + 1) so no example is silently dropped.
testData = data[nTrainData:]
testInputStringLabel = inputStringLabel[nTrainData:]
print(testData.shape)
trainLabel = label[0:nTrainData]
testOutputStringLabel = outputStringLabel[nTrainData:]

model = Sequential()
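# --- Illustration only: a hedged guess at how this second model might continue. ---
# The layer sizes, relu activation, mse loss and adam optimizer are assumptions; only
# model, trainData, trainLabel, nEpoch and the 300-d embedding size come from the code above.
from keras.layers import Dense

model.add(Dense(512, activation="relu", input_dim=300))
model.add(Dense(300))
model.compile(loss="mse", optimizer="adam")
model.fit(trainData, trainLabel, epochs=nEpoch, batch_size=128)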