def getVector( word2vecModel, sentence ):
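    """
    Build a sentence vector by summing the word2vec vectors of its tokens.

    Tokens missing from the model vocabulary fall back to their category
    term (via createKnownDict.categoriseWord) and finally to the generic
    unknown-word token. Returns the summed vector, the mean cosine between
    each word and the running sentence total, and the mean cosine between
    adjacent words.
    """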

    totalvec = np.zeros( word2vecModel.layer1_size )  # running sum of the word vectors
    costhetaTotals = []
    costhetaPrevs = []
    # loop over tokens
    lastvec = None
    for i_token, token in enumerate(sentence):
        vec = None
        # if token in vocab, use word2vec of it
        if token in word2vecModel.vocab:
            vec = word2vecModel[token]
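            # (vocab membership and direct indexing follow the pre-4.0 gensim Word2Vec API)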

        # otherwise fall back to the token's category term and, failing
        # that, to the generic unknown-word token
        else:
            wordtype = createKnownDict.categoriseWord(token)
            if wordtype in word2vecModel.vocab:
                vec = word2vecModel[wordtype]
            else:
                vec = word2vecModel["_UKN"]
        #print "vec", vec
        #print "tot", totalvec
        totalvec += vec
        costhetaTotal = cosTheta(vec, totalvec)
        costhetaTotals.append(costhetaTotal)
        if i_token > 0:
            costhetaPrev = cosTheta(vec, lastvec)
            costhetaPrevs.append(costhetaPrev)
        # add to the total
        totalvec += vec
        lastvec = vec

    # calculate averages
    meanWordSentCosTheta = np.mean(costhetaTotals)
    # a one-word sentence has no adjacent-word pairs, so default to 1.0
    # (np.mean of an empty list would give nan)
    meanWordWordCosTheta = 1.0
    if len(sentence) > 1:
        meanWordWordCosTheta = np.mean(costhetaPrevs)

    # return the summed sentence vector and the two mean cosine similarities
    return totalvec, meanWordSentCosTheta, meanWordWordCosTheta
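

# NOTE: cosTheta is assumed to be defined elsewhere in this module. A minimal
# sketch of the expected behaviour (cosine similarity of two numpy vectors),
# kept commented out here purely for reference, would be:
#
#   def cosTheta(vec1, vec2):
#       denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
#       if denom == 0.0:
#           return 0.0
#       return np.dot(vec1, vec2) / denom
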
def main():

    n = 3
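    # n-gram order: 3 gives trigram counts, 2 gives bigram counts
    # (see the output-filename selection further down)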

    corpusdirectory = "/home/nparslow/Documents/AutoCorrige/SpellChecker/sxpipeTokenisedWiki/"
    #corpusdirectory = "/home/nparslow/Documents/AutoCorrige/SpellChecker/sxpipeTokenisedWiki/divided/"

    #corpusfilename = "mi-frwiki1" # half wiki
    corpusfilename = "frwiki_net.tok" # full wiki
    #corpusfilename = "mini_frwiki_net_train.tok" # first 50k lines of dev part
    #corpusfilename = "frwiki_net_dev.tok" # 7,000,000 lines
    #corpusfilename = "editions.tok"

    fullfilename = os.path.join(corpusdirectory, corpusfilename)


    word2count = {}
    replacedwords = {}

    # we have to read twice, once to create the dictionary of known v unknown words:
    with codecs.open(fullfilename, encoding='utf-8') as corpusfile:
        word2count = createKnownDict.constructVocabularyStage1(corpusfile)
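    # word2count maps each token to a gensim-style Vocab object with a
    # .count attribute (only the counts are serialised further down)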

    # first pass over the vocabulary: split up tokens which are rare but divisible
    token2subtokens = {}
    for word in word2count.keys():
        if word2count[word].count < MINCOUNT:
            createKnownDict.secondaryTokenise(word, word2count, token2subtokens, MINCOUNT)
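    # token2subtokens now maps each rare-but-divisible token to its subtokens;
    # it is written out as the *_replace.json file below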

    # adjust the counts to reflect the subtokenisation:
    createKnownDict.adjustWord2Count(word2count, token2subtokens)

    # now replace any remaining tokens with insufficient counts by some category term:
    for word in word2count.keys():
        if word2count[word].count < MINCOUNT:
            replacementword = createKnownDict.categoriseWord(word)
            createKnownDict.adjustWord2CountReplacement(word2count, word, replacementword)
            if replacementword not in replacedwords:
                replacedwords[replacementword] = []
            replacedwords[replacementword].append(word)

    print "final vocab size:", sum([x.count for x in word2count.itervalues()])
    #print "tokens:", totaltokens, "types:", len(word2count)
    #print "too low tokens:", totaltokens, "too low types:", toolowtypes
    #print "fraction too low tokens", 1.0*toolowtokens/totaltokens
    #print "fraction too low types", 1.0*toolowtypes/len(word2count)

    if "" in word2count: print "empty string in vocab, something went wrong!"
    if " " in word2count: print "space in vocab, something went wrong!"

    # now read the corpus a second time and count the n-grams, now that we
    # know which words to replace and which to subtokenise
    nGramsDict = fileToNgrams(fullfilename, word2count, token2subtokens, mincount=MINCOUNT, n=n)
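    # nGramsDict maps each n-gram (a sequence of tokens) to its count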

    # now save the vocabulary (low-frequency words already replaced by their
    # category terms) as a JSON mapping of word -> count
    vocabfilename = "ngram_wiki_vocab.json"
    replaceFileName = "ngram_wiki_replace.json"
    nGramFileName = "ngram_wiki_count.txt"
    if n == 2:
        vocabfilename = "bigram_wiki_vocab.json"
        replaceFileName = "bigram_wiki_replace.json"
        nGramFileName = "bigram_wiki_count.txt"
    elif n == 3:
        vocabfilename = "trigram_wiki_vocab.json"
        replaceFileName = "trigram_wiki_replace.json"
        nGramFileName = "trigram_wiki_count.txt"

    with codecs.open(os.path.join(corpusdirectory, vocabfilename), mode='w', encoding='utf-8') as vfile:
        # the gensim Vocab object is not serialisable, so keep only the counts
        outdict = dict((x, y.count) for (x, y) in word2count.iteritems())
        json.dump(outdict, vfile)
    with codecs.open(os.path.join(corpusdirectory, replaceFileName), mode='w', encoding='utf-8') as rfile:
        json.dump(token2subtokens, rfile)
    with codecs.open(os.path.join(corpusdirectory, nGramFileName), mode='w', encoding='utf8') as nfile:
        for nGram in nGramsDict:
            # one n-gram per line, n-gram is separated by spaces, count by a tab
            nfile.write( " ".join(nGram) + "\t" + str(nGramsDict[nGram]) + "\n")

    # save the lists of replaced words so the groupings can be checked
    replacedFileName = "ngram_replaced.json"
    with codecs.open(os.path.join(corpusdirectory, replacedFileName), 'w', encoding='utf-8') as replfile:
        json.dump(replacedwords, replfile)
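

# Entry point: if this is the end of the script and no __main__ guard exists
# elsewhere in the file, the usual idiom would be:
#
#   if __name__ == "__main__":
#       main()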