Example #1
__author__ = 'nparslow'

import getopt
import os
import sys

import gensim

from compareCorrectedCorpus import getFinalTokenFormsAndTreesAndWeight

import calcPLex
# The helpers used in main() below (optionsFileReader, nGramModel, allCorpusFiles,
# variableAndParamsToString, loadLexiqueToDict, getDocumentProperties, savetoArff,
# savetoTreeArff) are assumed to be imported from the project's other modules,
# which are not shown in this excerpt.

print "loading freq info"
lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank()


# testxmls = ["analysed_SpellChecker/entry_81/0/0/0/entry_81.E1.dep.xml"

testxmls = [
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E1.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E5.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E2.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E6.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E3.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E7.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E4.dep.xml",
    ]

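# Collect "lemma_category" word forms from each dependency-parsed XML file listed above.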
wordforms = []
for testxml in testxmls:
    print "loading xml", testxml
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml)
    # each token maps to a list of (lemma, category) 2-tuples
    # print tok2lemmacats.values()
    for lemmacats in tok2lemmacats.values():
        wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])
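
# Entry point: read an options file, load the requested linguistic resources
# (frequency ranks, word2vec model, bigram model, Lexique syllable dictionary),
# compute the per-text variables and write them out as ARFF files.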
def main(argv):
    # TODO: add a debug variable and an optionsfilename variable
    optionsfilename = "settings/optionsfile.txt"
    try:
        opts, args = getopt.getopt(argv, "hi:", ["in_options_file="])
    except getopt.GetoptError:
        print 'textExtractor.py -i <input options file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'textExtractor.py -i <input options file>'
            sys.exit()
        elif opt in ("-i", "--in_options_file"):
            optionsfilename = arg

    globalparams, variableparams = optionsFileReader.readOptionsFile(optionsfilename)
    filenames = allCorpusFiles( globalparams["origtextdir"] )
    print filenames
    # TODO: for the moment we just take the first element of each of these
    #meltDir = globalparams["melteddir"][0]
    #ddagDir = globalparams["ddageddir"][0]
    #frmgDir = globalparams["frmgeddir"][0]
    filenameResources = {
        "filename": None,
        "melteddir" : globalparams["melteddir"][0],
        "ddageddir" : globalparams["ddageddir"][0],
        "frmgeddir" : globalparams["frmgeddir"][0],
    }

    print globalparams
    outarffdir = globalparams["outdir"][0]
    corpusName = globalparams["corpusName"][0]
    headerInfo = globalparams["headerInfo"][0]
    lexiquefile = globalparams["lexiqueDict"][0]
    freq2ranksfile = globalparams["freq2ranks"][0]

    gensimModelFile = globalparams["word2vecmodel"][0]

    variableTypes = set([x[0] for x in variableparams])

    lemmacat2freqrank = {}
    if "PLex" in variableTypes or "S" in variableTypes or "altS" in variableTypes or "LFP" in variableTypes:
        print "loading freq info"
        lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank(freq2ranksfile)

    word2vecModel = None
    if "w2vct" in variableTypes:
        print "loading word2vec model"
        word2vecModel = gensim.models.Word2Vec.load(gensimModelFile)

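    # defaults used when no bigram model is loaded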
    nGramDict, nmoGramDict, totalcounts = {}, {}, 1000000000000000
    if "bigramLogProb" in variableTypes:
        print "loading bi-gram model"
        ngramModelFile = globalparams["bigrammodel"][0]
        nGramDict, nmoGramDict, totalcounts = nGramModel.getNgramDicts(ngramModelFile)

    lexiqueDict = {}
    if "syllablesPerWord" in variableTypes:
        loadLexiqueToDict(lexiquefile, lexiqueDict)

    # create a list of variables for the arff file:
    variables = [ ("filename", "string")] + \
        [ (variableAndParamsToString(vname, params), "numeric") for vname,params in variableparams] + \
        [ ("level", "numeric")] # note that level must go last!

    for x in variables:
        print x

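    # NOTE: the hard-coded test files below override the corpus file list computed
    # above from "origtextdir"; only the last assignment takes effect.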
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Catja.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Arvid.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Amie4.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/B/Bror2.txt"]
    filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Caroline.txt"]
    filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/E/Eddy.txt"] # just to test an E
    filenames = ["/home/pinot/alpage/nparslow/Documents/Corpora/CEFLE/E/Eddy.txt"] # just to test an E


    resources = {
        "lemmacat2freqrank": lemmacat2freqrank,
        "word2vecModel": word2vecModel,
        "nGramDict": nGramDict,
        "nmoGramDict": nmoGramDict,
        "nGramCounts": totalcounts,
        "lexiqueDict": lexiqueDict
    }

    outputRows = []

    allobservedtrees = set()
    treespertext = []
    for filename in filenames:

        # we update the resource filename on each iteration:
        filenameResources["filename"] = filename
        baseFileName = os.path.basename(filename)
        baseFileName, extension = os.path.splitext(baseFileName)
        filenameResources["ddagfile"] = os.path.join(filenameResources["ddageddir"],
                                                         os.path.basename(baseFileName) + ".ddag")

        print
        print filename
        #fname = os.path.join(baseDir, filename)
        #if "CEFLE" in baseDir:
        #    fname = os.path.join(baseDir, filename[0], filename)

        #text = getDocumentProperties(frmgDir, meltDir, ddagDir, fname, word2vecModel,
        #                             (nGramDict, nmoGramDict, totalcounts), debug=False)
        #text = getDocumentProperties(filenameResources, variables, word2vecModel,
        #                             (nGramDict, nmoGramDict, totalcounts), debug=False)
        text = getDocumentProperties(filenameResources, variableparams, debug=False)

        text.calcVariables( resources )

        for i in range(len(variableparams)+1):
            #variable, params = variableparams[i]
            varlabel = variables[i]
            # squeeze the filename in first:
            value = filename
            if i > 0:
                value = text.variablevalues[i-1]
            print varlabel, "\t", str(value)
        print 'level', "\t", text.level

        allobservedtrees.update(text.trees.keys())
        treespertext.append( (set(text.trees.keys()), text.level) )
        #print "mwpw", text.getMeanWeightPerWord()
        outputRows.append( [baseFileName] + text.variablevalues + [text.level] )


    savetoArff(outarffdir, corpusName, headerInfo, variables, outputRows )
    savetoArff(outarffdir, corpusName + "class", headerInfo, variables, outputRows, levelAsClass=True )

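    # Build a second ARFF file with one binary column per observed parse tree,
    # plus the level column as the class.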
    arfftreefile = "testtrees"
    #corpusName = "test"
    #headerInfo = "a test corpus\n of stuff"
    treevariables = list(allobservedtrees)
    #print allobservedtrees
    #print treespertext
    treeoutputRows = [[1 if x in trees else 0 for x in treevariables] + [level] for trees, level in treespertext]
    treevariables.append("level")
    #print "ntrees", len(treevariables)
    #for a,b in zip(treevariables, treeoutputRows):
    #    print a,len(b), b

    savetoTreeArff(outarffdir, arfftreefile, headerInfo, treevariables, treeoutputRows, levelAsClass=True )