def skipthoughts_vectors(doc, sentences, encoder_decoder):
    """
    Creates averaged skipthoughts vectors for a document and for its corpus.

    The encode function produces an array of skipthought vectors with as many
    rows as there were sentences and 4800 dimensions. See the combine-skip
    section of the skipthoughts paper for a detailed explanation of the array.
    Each array is reduced to a single vector by averaging over its rows.

    Args:
        doc (str): the text of the document (before any normalization)
        sentences (list): the first and last sentences of each document in the corpus
        encoder_decoder: the skipthoughts encoder/decoder, passed straight
            through to ``sk.encode`` (its concrete type is not visible here)

    Returns:
        list: ``[doc_vector, corpus_vector]`` -- the document vector (the
        average of the skipthoughts vectors of the document's first and last
        sentences) followed by the corpus vector (the average of the
        individual skipthoughts vectors of ``sentences``). The two vectors are
        returned side by side; they are not concatenated here.
    """
    # Function-local import so that `sk` is always bound inside this function
    # and the skipthoughts module is only loaded when this featurizer runs.
    from src.featurizers.skipthoughts import skipthoughts as sk

    # Corpus side: one row per sentence, averaged over rows (axis 0).
    corpus_vector = np.mean(sk.encode(encoder_decoder, sentences), axis=0)

    # Document side: only the first and last sentence are encoded.
    doc_sentences = get_first_and_last_sentence(doc)
    doc_vector = np.mean(sk.encode(encoder_decoder, doc_sentences), axis=0)

    return [doc_vector, corpus_vector]
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full,
                             mem_net_params, vocab, full_vocab, w2v_model,
    # NOTE(review): the parameter list is never closed in this copy of the
    # file. The body reads `encoder_decoder`, so presumably a final
    # `encoder_decoder):` line was lost -- restore it from version control.
    """
    Generates observations to be fed into the mem_net code

    The embedding is chosen by mem_net_params['embed_mode'] and is one of
    'skip_thought', 'onehot' or 'word2vec'.

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net;
            the keys read here are 'mask_mode', 'embed_mode', 'onehot_min_len'
            and 'onehot_max_len'
        vocab (dict): the vocabulary of the data set (not referenced in the visible body)
        full_vocab (dict): the vocabulary used for the one-hot embedding; it must
            contain the punctuation tokens '.', ',', '!' and '?'
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is
            this can be per word for the end of a sentence
    """

    # Use the specified mask mode where available
    # NOTE(review): both assignments below sit inside the `if`, so a configured
    # mask_mode is immediately overwritten with 'sentence', and mask_mode is
    # left unbound when the key is absent or falsy. Presumably an `else:`
    # between the two assignments was lost -- confirm against version control.
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
        mask_mode = 'sentence'

    # NOTE(review): same pattern as mask_mode above -- as written embed_mode is
    # always 'word2vec' when the key is set and unbound otherwise; an `else:`
    # is presumably missing.
    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        # The skipthoughts module is imported only when this embedding is selected
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        # NOTE(review): neither `if` below has a body (only a commented-out
        # print), which is a syntax error. The statements that lengthened a
        # one-sentence corpus / document appear to be missing.
        if len(sentences_full) == 1:
            #print("short corpus")
        if len(doc_sentences) == 1:
            #print("short doc")
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        # Optional length bounds; they stay None unless configured with a truthy value
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        # NOTE(review): the call below is never closed -- its argument line is
        # missing (presumably derived from raw_corpus; verify).
        corpus_tokens = tokenize.word_punct_tokens(
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys(
        )  # ensure that you are using a vocabulary w/ punctuation
        # NOTE(review): get_mask's remaining arguments are missing; the call is
        # left open.
        sentence_mask = get_mask(corpus_indices,
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        # NOTE(review): the rest of this run_onehot call is missing; the call
        # is left open.
        corpus_vectors = run_onehot(corpus_encoded,
        # Tokenize and  one-hot encode query document
        # NOTE(review): unlike the corpus call above, no document argument is
        # passed here; a first-argument line (presumably derived from raw_doc)
        # looks to have been lost -- verify.
        doc_vectors = run_onehot(
            onehot_vocab, min_length, max_length)

        # Both results are transposed before being handed back
        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        # NOTE(review): as written the sentence mask is immediately replaced by
        # the per-entry index list, and doc_masks is unbound for any other
        # mask_mode; an `else:` is presumably missing between the assignments.
        if mask_mode == 'sentence':
            doc_masks = sentence_mask
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        # NOTE(review): this call is never closed; by analogy with the corpus
        # call above a trailing `mask_mode)` line is presumably missing.
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params,

        # doc_questions / doc_input are only bound when both results are non-empty
        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    # NOTE(review): if embed_mode matches none of the branches, or a word2vec
    # result is empty, the names below are unbound and this line raises
    # UnboundLocalError.
    return doc_input, doc_questions, doc_masks
