__author__ = 'nparslow' from compareCorrectedCorpus import getFinalTokenFormsAndTreesAndWeight import calcPLex print "loading freq info" lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank() # testxmls = ["analysed_SpellChecker/entry_81/0/0/0/entry_81.E1.dep.xml" testxmls = [ "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E1.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E5.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E2.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E6.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E3.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E7.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E4.dep.xml", ] wordforms = [] for testxml in testxmls: print "loading xml", testxml tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml) # each tok2lemmacat goes to a list to 2-tuples # print tok2lemmacats.values() for lemmacats in tok2lemmacats.values(): wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])
def main(argv): # todo add a debug variable and optionfilename variable optionsfilename = "settings/optionsfile.txt" try: opts, args = getopt.getopt(argv,"hi:", ["in_options_file="]) except getopt.GetoptError: print 'textExtractor.py -i <input options file>' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'textExtractor.py -i <input options file>' sys.exit() elif opt in ("-i", "--in_options_file"): optionsfilename = arg globalparams, variableparams = optionsFileReader.readOptionsFile(optionsfilename) filenames = allCorpusFiles( globalparams["origtextdir"] ) print filenames # todo for the moment we just take the first element for these #meltDir = globalparams["melteddir"][0] #ddagDir = globalparams["ddageddir"][0] #frmgDir = globalparams["frmgeddir"][0] filenameResources = { "filename": None, "melteddir" : globalparams["melteddir"][0], "ddageddir" : globalparams["ddageddir"][0], "frmgeddir" : globalparams["frmgeddir"][0], } print globalparams outarffdir = globalparams["outdir"][0] corpusName = globalparams["corpusName"][0] headerInfo = globalparams["headerInfo"][0] lexiquefile = globalparams["lexiqueDict"][0] freq2ranksfile = globalparams["freq2ranks"][0] gensimModelFile = globalparams["word2vecmodel"][0] variableTypes = set([x[0] for x in variableparams]) lemmacat2freqrank = {} if "PLex" in variableTypes or "S" in variableTypes or "altS" in variableTypes or "LFP" in variableTypes: print "loading freq info" lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank(freq2ranksfile) word2vecModel = None if "w2vct" in variableTypes: print "loading word2vec model" word2vecModel = gensim.models.Word2Vec.load(gensimModelFile) nGramDict, nmoGramDict, totalcounts = {}, {}, 1000000000000000 if "bigramLogProb" in variableTypes: print "loading bi-gram model" ngramModelFile = globalparams["bigrammodel"][0] nGramDict, nmoGramDict, totalcounts = nGramModel.getNgramDicts(ngramModelFile) lexiqueDict = {} if "syllablesPerWord" in variableTypes: loadLexiqueToDict(lexiquefile, lexiqueDict) # create a list of variables for the arff file: variables = [ ("filename", "string")] + \ [ (variableAndParamsToString(vname, params), "numeric") for vname,params in variableparams] + \ [ ("level", "numeric")] # note that level must go last! for x in variables: print x #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Catja.txt"] #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Arvid.txt"] #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Amie4.txt"] #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/B/Bror2.txt"] filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Caroline.txt"] filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/E/Eddy.txt"] # just to test an E filenames = ["/home/pinot/alpage/nparslow/Documents/Corpora/CEFLE/E/Eddy.txt"] # just to test an E resources = { "lemmacat2freqrank": lemmacat2freqrank, "word2vecModel": word2vecModel, "nGramDict": nGramDict, "nmoGramDict": nmoGramDict, "nGramCounts": totalcounts, "lexiqueDict": lexiqueDict } outputRows =[] allobservedtrees = set([]) treespertext = [] for filename in filenames: # we change the resouce filename with each round: filenameResources["filename"] = filename baseFileName = os.path.basename(filename) baseFileName, extension = os.path.splitext(baseFileName) filenameResources["ddagfile"] = os.path.join(filenameResources["ddageddir"], os.path.basename(baseFileName) + ".ddag") print print filename #fname = os.path.join(baseDir, filename) #if "CEFLE" in baseDir: # fname = os.path.join(baseDir, filename[0], filename) #text = getDocumentProperties(frmgDir, meltDir, ddagDir, fname, word2vecModel, # (nGramDict, nmoGramDict, totalcounts), debug=False) #text = getDocumentProperties(filenameResources, variables, word2vecModel, # (nGramDict, nmoGramDict, totalcounts), debug=False) text = getDocumentProperties(filenameResources, variableparams, debug=False) text.calcVariables( resources ) for i in range(len(variableparams)+1): #variable, params = variableparams[i] varlabel = variables[i] # squeeze the filename in first: value = filename if i > 0: value = text.variablevalues[i-1] print varlabel, "\t", str(value) print 'level', "\t", text.level allobservedtrees.update(text.trees.keys()) treespertext.append( (set(text.trees.keys()), text.level) ) #print "mwpw", text.getMeanWeightPerWord() outputRows.append( [baseFileName] + text.variablevalues + [text.level] ) savetoArff(outarffdir, corpusName, headerInfo, variables, outputRows ) savetoArff(outarffdir, corpusName + "class", headerInfo, variables, outputRows, levelAsClass=True ) arfftreefile = "testtrees" #corpusName = "test" #headerInfo = "a test corpus\n of stuff" treevariables = list(allobservedtrees) #print allobservedtrees #print treespertext treeoutputRows = [[1 if x in trees else 0 for x in treevariables] + [level] for trees, level in treespertext] treevariables.append("level") #print "ntrees", len(treevariables) #for a,b in zip(treevariables, treeoutputRows): # print a,len(b), b savetoTreeArff(outarffdir, arfftreefile, headerInfo, treevariables, treeoutputRows, levelAsClass=True )