import gensim
import getCohesionVariables
import readDdag

__author__ = 'nparslow'

gensimModelFile = '/home/nparslow/Documents/AutoCorrige/SpellChecker/' \
                      'word2vec_models/orig/gensim_output/fullwiki_withcats.word2vec'
print "loading word2vec model"
word2vecModel = gensim.models.Word2Vec.load(gensimModelFile)

ddagfile = "/home/nparslow/Documents/AutoCorrige/Corpora/ddaged_CORPUS_CEFLE/Edna.ddag"
ddagfile = "/home/nparslow/Documents/AutoCorrige/Corpora/ddaged_CORPUS_CEFLE/Berthold.ddag"
ddagfile = "/home/nparslow/Documents/AutoCorrige/Corpora/ddaged_CORPUS_CEFLE/Crysmynta.ddag"


sentences = readDdag.readDdag(ddagfile)

a,b,c,d = getCohesionVariables.getCohesionVariables(word2vecModel, sentences)
예제 #2
0
 def getVariable(self, variable, params, resources ):
     """Return the value of one text-complexity metric for this document.

     Dispatches on `variable` over a long if/elif chain of metric names:
     raw counts, lexical-diversity measures (PLex, vocd, MTLD, HD-D, LFP),
     spelling/MElt statistics, parse-based measures and n-gram/word2vec
     cohesion measures.

     :param variable: metric name, e.g. "words", "PLex", "vocd", "w2vct".
     :param params: metric-specific parameter list. Most branches read only
         params[0] (a statistic selector or a numeric threshold);
         "LFP" consumes the whole list as difficulty bins, and
         "treeTypesPerSent" unpacks it with *params.
     :param resources: dict of shared external resources; keys used here:
         "lemmacat2freqrank", "word2vecModel", "nGramDict", "nmoGramDict",
         "nGramCounts".
     :return: usually a float; some branches return richer values
         ("altS" returns the whole fitted-parameter object, "LFP",
         "w2vct" and "bigramLogProbs" return whatever their helper returns).
     """
     if variable == "paragraphs":
         return len(self.paragraphStarts)
     elif variable == "sentences":
         return len(self.sentences)
     elif variable == "words":
         return self.getNWords()
     elif variable == "sentsPerPara":
         #print "sents per para", params[0]
         # Paragraph sizes are differences between consecutive paragraph
         # start indices (with a sentinel at the total sentence count).
         return getDiffStatistic( self.paragraphStarts +[len(self.sentences)], params[0] )
     elif variable == "wordsPerSent":
         return getCountStatistic( [len(x.uniquetokens) for x in self.sentences], params[0] )
     elif variable == "lettersPerWord":
         #print [y for x in self.sentences for y in x.uniquetokens]
         # we need the if requirement to remove punctuation etc.
         # [^\W\d_] matches letters only (no digits/underscore/punctuation).
         return getCountStatistic( [len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE))
                                    for x in self.sentences for y in x.uniquetokens
                                    if len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE)) > 0], params[0] )
     elif variable == "syllablesPerWord":
         #print "syllables per word", [x.listSyllablesPerWord() for x in self.sentences]
         return getCountStatistic( flatten2LevelList([x.listSyllablesPerWord() for x in self.sentences]), params[0] )
     elif variable == "PLex":
         # as the fit can crash
         #print "plexing"
         try:
             popt, pcov = calcPLex.calcPLex( self.lemmacats, resources["lemmacat2freqrank"], difficultRank=params[0])
             #print "plex", popt, pcov
             return popt[0]
         except:
             # NOTE(review): bare except -- ANY failure (not just a fit
             # error) is silently mapped to 0.0.
             print "P_Lex fit problem"
             return 0.0
     elif variable == "S":
         popt, pcov = calcPLex.calcS( self.lemmacats, resources["lemmacat2freqrank"])
         return popt[0]
     elif variable == "altS":
         #return self.getAltSValues( resources["lemmacat2freqrank"] )
         # Unlike "S", this returns the full fitted-parameter object.
         popt, pcov = calcPLex.calcAB( self.lemmacats, resources["lemmacat2freqrank"])
         return popt
     elif variable == "vocd":
         # Require at least 50 lemma tokens (presumably for a stable vocd
         # estimate -- TODO confirm against calcPLex.calcVOCD); -1.0 marks
         # "not computable".
         if len(self.lemmacats) >= 50:
             #print "vocd", self.lemmacats
             #print  calcPLex.getVOCD(self.lemmacats), calcPLex.calcVOCD(self.lemmacats)
             return calcPLex.calcVOCD(self.lemmacats)
         else:
             return -1.0
     elif variable == "mtld":
         return calcPLex.calcMTLD(self.lemmacats, params[0])
     elif variable == "hdd":
         # 42-token minimum -- presumably the HD-D sample size; confirm in
         # calcHDD. Note the "not computable" sentinel is 0.0 here, not -1.0.
         if len(self.lemmacats) >= 42:
             return calcHDD.calcHDD(self.lemmacats)
         return 0.0
     elif variable == "LFP":
         #vocabs, vocabOther, vocabUnk =\
         #print "lfp:", params
         #print calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params)
         return calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params)
     elif variable == "spellcorr":
         # 1.0* forces float division (Python 2 integer-division guard).
         return 1.0* sum([x.spellingcorrections for x in self.sentences])/self.getNWords()
     elif variable == "meltdiff":
         return 1.0* sum([x.meltdiffs for x in self.sentences])/self.getNWords()
     elif variable == "meanmelt": # gets the geometic mean:
         return math.pow( product(flatten2LevelList([x.meltconfidences for x in self.sentences])), 1.0/self.getNWords())
     elif variable == "parsed":
         # need a param either 'full', 'corrected' or 'robust'
         #print "parsed:", params[0], type(params[0])
         #print [x.parsed for x in self.sentences]
         #print [type(x.parsed) for x in self.sentences]
         # Fraction of sentences whose parse status equals params[0].
         return 1.0* len( [x.parsed for x in self.sentences if x.parsed == params[0]])/len(self.sentences)
     elif variable == "weightPerWord":
         # Token-weighted average of per-sentence parse weight.
         return 1.0* sum([x.weightperword*len(x.uniquetokens) for x in self.sentences])/self.getNWords()
     elif variable == "verb":
         #print params[0]
         #print self.vanalysis.keys()
         # Share of verb groups with the analysis tag params[0].
         if params[0] in self.vanalysis:
             return 1.0*self.vanalysis[params[0]]/self.vgroups
         else:
             return 0.0
     elif variable == "clause":
         #print "CLAUSE", params[0], self.vanalysis[params[0]]
         #print "CLAUSE", self.vanalysis.keys()
         # Same counts as "verb", but normalised per sentence instead of
         # per verb group.
         if params[0] in self.vanalysis:
             return 1.0*self.vanalysis[params[0]]/len(self.sentences)
         else:
             return 0.0
     elif variable == "w2vct":
         return getCohesionVariables.getCohesionVariables(resources["word2vecModel"], self.ddagSentences)
     elif variable == "treeTypesPerSent":
         return getCountStatistic( [len(x.trees.keys()) for x in self.sentences], *params )
     elif variable == "TreeTypesHDD":
         #print "hdd", self.trees.values()
         return calcHDD.calcHDDfromFreq(self.trees.values())
     elif variable == "TreeTypesYuleK":
         #print "yulek", self.trees.values()
         return calcHDD.calcYuleKfromFreq(self.trees.values())
     elif variable == "noVerbSentences":
         # Fraction of sentences with hasnomainverb > 0.
         return 1.0*len([x.hasnomainverb for x in self.sentences if x.hasnomainverb > 0])/len(self.sentences)
     elif variable == "toksBeforeMainVerb":
         # Mean over sentences where the count is defined; negative values
         # presumably mark "no main verb found" -- TODO confirm upstream.
         return 1.0*sum([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0])/ \
                len([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0])
     # currently no sentence boundaries:
     elif variable == "bigramLogProbs":
         return nGramModel.analyseTokens(flatten2LevelList(self.ddagSentences),
                                         resources["nGramDict"], resources["nmoGramDict"], resources["nGramCounts"])
     # NOTE(review): an unrecognised `variable` falls through and returns
     # None implicitly (the elif chain may continue beyond this excerpt).