# Required imports; NLTK's 'punkt' tokenizer data must be downloaded beforehand
import pickle

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# 'client' is assumed to be the provided web-service client, initialized elsewhere

# Load the dataset that was obtained from the API and pickled beforehand
with open('dataset.pkl', 'rb') as handle:
    dataset = pickle.load(handle)

# Extract all questions, tokenized as sentences, from the dataset
sentences = []
for record in dataset:
    sentences += nltk.sent_tokenize(record['question1'])
    sentences += nltk.sent_tokenize(record['question2'])

# Tokenize into words, count word frequencies, and select the top 10000
word_dist = nltk.FreqDist()
for s in sentences:
    word_dist.update(w.lower() for w in nltk.word_tokenize(s))

# most_common returns a list of (word, count) tuples in descending frequency order
top_words = word_dist.most_common(10000)
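# A quick sanity check (purely illustrative): each entry is a (word, count)
# pair, e.g. something like ('the', 48210) -- the count shown here is made up
print(top_words[:5])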

# Obtain the GloVe vectors for the 10000 words from the web service,
# requesting them in 100 batches of 100 words each
embeddings_list = []
for batch in range(100):
    embeddings_list += client.w2v(
        [word for word, count in top_words[batch * 100:(batch + 1) * 100]])

# Build a dictionary with the word as the key and the GloVe vector as the
# value, and pickle it
embeddings_index = {}
for entry in embeddings_list:
    embeddings_index[entry['word']] = entry['vec']

with open('embeddings_index1.pkl', 'wb') as handle:
    pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
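
# A minimal sketch of how the pickled index might be used later: load it back
# and look up the vector for a single word ('what' is just an example key;
# any word from the top-10000 list would work)
with open('embeddings_index1.pkl', 'rb') as handle:
    loaded_index = pickle.load(handle)
example_vec = loaded_index.get('what')
if example_vec is not None:
    print('Vector for "what" has', len(example_vec), 'dimensions')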
# The code below works with a separate reviews dataset; 'val' is assumed to
# hold the list of review records returned by an earlier web-service call.
# You can access individual fields such as "summary" and "rating" that can be
# used to find the sentiment.
for item in val[:100]:
    print("Summary ==> ", item["summary"], "\t\tRating ==> ",
          item["rating"])

# A summary may be one or more sentences of text. We need to break these into
# words; further, we need to convert each word to a vector form. Our web
# service provides a function that accepts a list of words and returns the
# corresponding vectors. In the example below, we take the first item returned
# by the previous call and convert it into a sequence of vectors.
text = val[0]["summary"]
print("The input text is: ", text)

# Get sentence tokens from text that may have more than one sentence;
# we use NLTK's sent_tokenize for this
sentences = sent_tokenize(text)  # we get the list of sentences
all_words = []
for sentence in sentences:
    all_words.extend(word_tokenize(sentence))

# all_words contains all the words in the text as a single list;
# let us get the vectors for these (the loop variable is not named 'val' here,
# so it does not shadow the reviews list above)
word_vecs = client.w2v(all_words)
for wv in word_vecs:
    print(wv["word"], wv["vec"])

# Now you can continue further by vectorizing the class label and creating the
# required dataset
# your code ......
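
# A minimal sketch of one possible continuation, purely illustrative: average
# each summary's word vectors into a fixed-length feature and map the rating
# to a binary sentiment label. The threshold of 3 and the helper name below
# are assumptions, not part of the assignment.
import numpy as np

def summary_to_feature(summary_text):
    # Tokenize the summary into words and average the returned word vectors
    words = []
    for s in sent_tokenize(summary_text):
        words.extend(word_tokenize(s))
    vecs = [wv["vec"] for wv in client.w2v(words)]
    return np.mean(np.array(vecs, dtype=float), axis=0)

features = [summary_to_feature(item["summary"]) for item in val[:100]]
labels = [1 if item["rating"] > 3 else 0 for item in val[:100]]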