Exemplo n.º 1
0
 def __init__(self,configfile):
     """Read *configfile* and cache the settings of its 'default' section.

     configfile -- path handed both to RawConfigParser.read() and to
                   MENReader.

     Attributes set: configfile, config (the parser), myMenReader,
     whereami (raw string), pos and options (parsed as Python literals)
     and outputvectors (True only when the option is exactly 'True').
     """
     self.configfile = configfile
     print("Reading configuration from " + self.configfile)

     parser = ConfigParser.RawConfigParser()
     self.config = parser
     parser.read(self.configfile)

     self.myMenReader = MENReader(self.configfile)

     def setting(key):
         # Every option read here lives in the 'default' section.
         return parser.get('default', key)

     self.whereami = setting('whereami')
     self.pos = ast.literal_eval(setting('pos'))
     self.options = ast.literal_eval(setting('options'))
     self.outputvectors = (setting('outputvectors') == 'True')
Exemplo n.º 2
0
class MENManager:
    """Run vector reweighting and/or a MEN evaluation driven by a config file.

    The configuration is read with RawConfigParser. Vectors are loaded and
    reweighted through a SimEngine, and the evaluation data is accessed
    through a MENReader (both are defined outside this class).

    The print calls use the single-argument ``print(...)`` form so that the
    output is the same as the former Python 2 print statements.
    """

    # One-letter part-of-speech code -> name used inside vector file names.
    posmap = {"N": "nouns", "V": "verbs", "J": "adjs", "R": "advs"}
    # Not referenced by the active code of this class (kept for callers).
    INCLUDE_PREFIX = "_is_included_"
    # File that run_MEN writes its results to.
    RESULTS_FILE = "men2.out"

    def __init__(self, configfile):
        """Read *configfile* and cache the settings of its 'default' section.

        Sets configfile, config, myMenReader, whereami, pos, options and
        outputvectors (True only when the option is exactly 'True').
        """
        self.configfile = configfile
        print("Reading configuration from " + self.configfile)
        self.config = ConfigParser.RawConfigParser()
        self.config.read(self.configfile)
        self.myMenReader = MENReader(self.configfile)
        self.whereami = self.config.get('default', 'whereami')
        self.pos = ast.literal_eval(self.config.get('default', 'pos'))
        self.options = ast.literal_eval(self.config.get('default', 'options'))
        self.outputvectors = (self.config.get('default', 'outputvectors') == 'True')

    def get_vector_name(self, pos):
        """Return the vector file name for *pos* and refresh run parameters.

        pos -- the part-of-speech name placed in the file suffix (callers in
               this class pass a value of ``posmap``).

        Besides returning ``myname + suffix`` this method (re)sets myname,
        minorder, maxorder, reducedstring, suffix, normalised, weighting,
        wthreshold, cds and saliency from the configuration; the run_*
        methods rely on those attributes being set here.
        """
        self.myname = self.config.get(self.whereami, 'parentdir') + self.config.get('default', 'filename')
        mini = self.config.get('default', 'minorder')
        maxi = self.config.get('default', 'maxorder')
        if mini == "X":
            # "X" selects the fixed orders 0..2 and no ".reduce_" name part.
            self.minorder = 0
            self.maxorder = 2
            self.reducedstring = ""
        else:
            self.minorder = int(mini)
            self.maxorder = int(maxi)
            self.reducedstring = ".reduce_" + str(mini) + "_" + str(maxi)
        self.suffix = "." + pos + self.reducedstring + ".filtered"
        self.normalised = (self.config.get('default', 'normalised') == "True")
        if self.normalised:
            self.suffix += ".norm"
        self.weighting = ast.literal_eval(self.config.get('default', 'weighting'))
        self.wthreshold = ast.literal_eval(self.config.get('default', 'wthreshold'))
        self.cds = ast.literal_eval(self.config.get('default', 'cds'))
        self.saliency = ast.literal_eval(self.config.get('default', 'saliency'))
        return self.myname + self.suffix

    def _is_included_N(self, token):
        """Return True if the lemma of *token* is in ``self.nounlist``.

        The lemma is the text before the first "/". A matching lemma is
        marked with 1 in ``self.nounlist`` so run_MEN can report the entries
        that were never seen.
        """
        lemma = token.split("/")[0]
        if lemma in self.nounlist:
            self.nounlist[lemma] = 1
            return True
        return False

    def _is_any(self, token):
        """Inclusion filter that accepts every token."""
        return True

    @staticmethod
    def _cds_enabled(cds):
        """Interpret a configured cds value as a boolean.

        The strings 'True' and 'False' map to their boolean meaning; any
        other value is judged by its truthiness. Using one rule everywhere
        keeps the "_cds" file-name marker and the 'smooth_ppmi' weighting in
        agreement whether the config lists strings or booleans.
        """
        if cds in ('True', 'False'):
            return cds == 'True'
        return bool(cds)

    @staticmethod
    def _format_result(res):
        """Return the line written to the results file for one result tuple."""
        return str(res) + "\n"

    def _build_simengine(self, pos, include_filter):
        """Create ``self.mySimEngine`` for *pos* using *include_filter*.

        Prints the file names, *pos* and the cause, then raises
        SystemExit(-1), if the SimEngine cannot be constructed.
        """
        print("Generating SimEngine")
        filenames = {pos: self.get_vector_name(MENManager.posmap[pos])}
        try:
            self.mySimEngine = SimEngine(filenames, include_filter)
        except Exception as err:
            print("Fatal Error: Unable to generate simEngine")
            print(filenames)
            print(pos)
            print("Cause: %r" % (err,))
            raise SystemExit(-1)
        print("Successfully generated SimEngine and loaded vectors")

    def _parameter_grid(self):
        """Yield every (cds, wt, w, cons) combination from the configuration.

        Iteration order is cds, then weighting, then wthreshold, then
        saliency (innermost).
        """
        for cds in self.cds:
            for wt in self.weighting:
                for w in self.wthreshold:
                    for cons in self.saliency:
                        yield cds, wt, w, cons

    def _reweight(self, pos, cds, wt, w, cons):
        """Reweight the loaded vectors for one parameter combination."""
        print("Reweighting vectors")
        # Built fresh for every combination so 'smooth_ppmi' is applied only
        # when this particular cds value asks for it.
        weighting = [wt]
        if self._cds_enabled(cds):
            weighting.append('smooth_ppmi')
        # NOTE(review): the stream is handed to SimEngine.reweight and is not
        # closed here because it is not visible whether the engine keeps
        # using it afterwards -- confirm ownership and close it if possible.
        outstream = self.getvectorstream(pos, cds, wt, w, cons)
        self.mySimEngine.reweight(pos, weighting=weighting, ppmithreshold=float(w),
                                  saliency=cons, outstream=outstream)

    def generate_simengine(self, pos="N"):
        """Create the SimEngine for *pos*, keeping only tokens in nounlist.

        Raises SystemExit(-1) if the SimEngine cannot be constructed.
        """
        self._build_simengine(pos, self._is_included_N)

    def getvectorstream(self, pos, cds, wt, w, cons):
        """Return an open binary output file for reweighted vectors, or None.

        None is returned when ``self.outputvectors`` is false. Otherwise the
        file name is the vector name for *pos* followed by "_cds" (only when
        cds is enabled), "_<wt>", "_shift<w>" and "_cs<cons>"; the caller is
        responsible for the returned stream.
        """
        if not self.outputvectors:
            return None
        filename = self.get_vector_name(MENManager.posmap[pos])
        if self._cds_enabled(cds):
            filename += "_cds"
        filename += "_" + wt
        filename += "_shift" + str(w)
        filename += "_cs" + str(cons)
        return open(filename, "wb")

    def run_reweight(self):
        """Reweight the vectors of every configured part of speech.

        All tokens are loaded (no inclusion filtering) and every parameter
        combination from the configuration is applied in turn.
        """
        for pos in self.pos:
            self._build_simengine(pos, self._is_any)
            for cds, wt, w, cons in self._parameter_grid():
                self._reweight(pos, cds, wt, w, cons)

    def run_MEN(self):
        """Evaluate every parameter combination on MEN for each configured pos.

        For each part of speech the needed entries are loaded, the vectors
        are reweighted per combination, the reader's correlation is computed
        and each (cds, wt, w, cons, correlation) tuple is printed and
        appended to RESULTS_FILE.
        """
        self.myMenReader.readfile()
        self.tokenlists = {}
        for pos in self.pos:
            # 0 = not seen yet; _is_included_N flips an entry to 1 on load.
            self.nounlist = dict.fromkeys(self.myMenReader.getEntryList(pos), 0)
            print("Need to load %s vectors " % (str(len(self.nounlist))))
            self.generate_simengine(pos)
            missed = [x for x in self.nounlist if self.nounlist[x] == 0]
            print("Not found:  %s" % (missed,))

            results = []
            # NOTE(review): "wb" truncates the file for every pos, so only
            # the last part of speech survives a multi-pos run -- confirm
            # that this is intended.
            with open(MENManager.RESULTS_FILE, "wb") as resultsstream:
                resultsstream.write("Starting MEN evaluation at: " + time.strftime("%c") + "\n")

            for cds, wt, w, cons in self._parameter_grid():
                self._reweight(pos, cds, wt, w, cons)
                self.myMenReader.updateAutoSims(
                    self.mySimEngine.selectedSims(self.myMenReader.getPairList(pos)))
                res = (cds, wt, w, cons, self.myMenReader.triples.correlate(show_graph=False))

                print(res)
                # Reopened per result so each line is on disk as soon as it
                # is known. The tuple must be converted: write() needs text.
                with open(MENManager.RESULTS_FILE, "ab") as resultsstream:
                    resultsstream.write(self._format_result(res))
                results.append(res)

            print("Summary of results for  %s" % (self.weighting,))
            for res in results:
                print("%s %s %s %s %s" % res)
            with open(MENManager.RESULTS_FILE, "ab") as resultsstream:
                resultsstream.write("Ending MEN evaluation at: " + time.strftime("%c") + "\n")

    def run(self):
        """Dispatch on ``self.options``: "MEN" first, otherwise "reweight"."""
        if "MEN" in self.options:
            self.run_MEN()
        elif "reweight" in self.options:
            self.run_reweight()
        else:
            print("Unknown options:  %s" % (self.options,))