# testxmls = ["analysed_SpellChecker/entry_81/0/0/0/entry_81.E1.dep.xml" testxmls = [ "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E1.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E5.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E2.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E6.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E3.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E7.dep.xml", "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E4.dep.xml", ] wordforms = [] for testxml in testxmls: print "loading xml", testxml tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml) # each tok2lemmacat goes to a list to 2-tuples # print tok2lemmacats.values() for lemmacats in tok2lemmacats.values(): wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats]) print wordforms print "tokens", len(wordforms) print "types", len(set(wordforms)) popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank) print "popt", popt print "pcov", pcov print "S" popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank)
import compareCorrectedCorpus __author__ = 'nparslow' for fnum in range(1, 18): xmlfile = "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_CORPUS_CEFLE/Billy4/0/0/0/Billy4.E" + str(fnum) + ".dep.xml" finaltokens, origlemmacats, origverb2info, origtrees, origweight, wordsbeforemainverb =\ compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(xmlfile) print fnum, wordsbeforemainverb
def getNextSentenceFromFiles( processedDepXMLFile, processedTokenFile, processedLogFile, debug=False ):
    """Build a Sentence object from one sentence's processed parser output.

    Parameters:
        processedDepXMLFile: path to the sentence's .dep.xml dependency parse.
        processedTokenFile: path to the matching token file.
        processedLogFile: parser log path (accepted but not read in this
            function as written).
        debug: when True, print extra diagnostics about the observed tokens.

    Returns:
        A populated Sentence, or None when processedDepXMLFile does not exist.
    """
    #print processedSentenceFile
    sentence = None
    if os.path.isfile(processedDepXMLFile):
        sentence = Sentence()
        tree = ET.parse(processedDepXMLFile)
        # 'W' nodes are 'words' which can include multiple tokens, e.g. 'bien que' is one word
        # .iter for recursive, .findall for depth of 1
        # id the cluster then get the lex element from the cluster (we'll process it later)
        # NOTE(review): x.get('lemma') would be None for a node without a
        # lemma attribute, making len() raise — presumably every 'node' in
        # this XML carries a lemma; confirm against the parser output format.
        wordsforms = [(x.attrib['lemma'], x.attrib['form'], x.attrib['cluster'],
                       fixMixedEncodings(tree.findall("cluster[@id='" + x.attrib['cluster']+"']")[0].attrib["lex"]))
                      for x in tree.iter('node')
                      if len(x.get('lemma'))>0 and x.get('lemma') != "_EPSILON"]
        print "wordsforms"
        print wordsforms
        # correct the encodings and remove epsilons
        tokens = getTokensFromFile(processedTokenFile)
        if len(wordsforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            # Token(lemma, form, observed forms pulled from the "lex" field,
            # token positions parsed from the F<d>| markers); words are
            # ordered by the start offset encoded in the cluster id.
            sentence.tokens = [Token(x[0], x[1],
                                     re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(wordsforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            # NOTE(review): here the 3rd/4th Token args are a bare string and
            # int, whereas the parsed branch passes lists — confirm Token
            # tolerates both shapes.
            print "using tokens***"
            sentence.tokens = [Token(None, None, tokens[i], i) for i in range(len(tokens))]
        #print sentence.tokens
        #print "obs forms:", [x.observedform for x in sentence.tokens]
        # NOTE(review): the 5th element is unpacked as (weight, minweight)
        # here but as (weight, maxweight) at another call site — confirm.
        tok2finalforms, tok2lemmacats, verb2info, trees, (weight, minweight) = \
            compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(processedDepXMLFile)
        sentence.setLemmaCats(tok2lemmacats)
        sentence.weightperword = weight
        sentence.minweight = minweight
        verbAnalysis = classifyVerbs.classifyVerbs(verb2info)
        # Per-category verb counters; each defaults to 0 and is incremented
        # only when classifyVerbs produced that key.
        sentence.vsingle = 0
        if "single" in verbAnalysis:
            sentence.vsingle += verbAnalysis["single"]
        sentence.vaux = 0
        if "aux" in verbAnalysis:
            sentence.vaux += verbAnalysis["aux"]
        sentence.vcompound = 0
        if "compound" in verbAnalysis:
            sentence.vcompound += verbAnalysis["compound"]
        sentence.vindicative = 0
        if "indicative" in verbAnalysis:
            sentence.vindicative += verbAnalysis["indicative"]
        sentence.vconditional = 0
        if "conditional" in verbAnalysis:
            sentence.vconditional += verbAnalysis["conditional"]
        sentence.vsubjunctive = 0
        # the dict key uses the French spelling "subjonctive"
        if "subjonctive" in verbAnalysis:
            sentence.vsubjunctive += verbAnalysis["subjonctive"]
        sentence.vimperfect = 0
        if "imperfect" in verbAnalysis:
            sentence.vimperfect += verbAnalysis["imperfect"]
        sentence.vfuture = 0
        if "future" in verbAnalysis:
            sentence.vfuture += verbAnalysis["future"]
        sentence.vpresent = 0
        if "present" in verbAnalysis:
            sentence.vpresent += verbAnalysis["present"]
        sentence.vnotense = 0
        if "notense" in verbAnalysis:
            sentence.vnotense += verbAnalysis["notense"]
        # clause info:
        sentence.crel = 0
        if "rel" in verbAnalysis:
            sentence.crel += verbAnalysis["rel"]
        sentence.cnom = 0
        if "nom" in verbAnalysis:
            sentence.cnom += verbAnalysis["nom"]
        sentence.cacc = 0
        if "acc" in verbAnalysis:
            sentence.cacc += verbAnalysis["acc"]
        sentence.cloc = 0
        if "loc" in verbAnalysis:
            sentence.cloc += verbAnalysis["loc"]
        # Count tokens whose final (corrected) form differs from the observed
        # token — taken as sxpipe spelling corrections.  tok2finalforms is
        # 1-indexed (first token is no. 1, per the dead code below).
        sxpipeSpellingChanges = 0
        for i in range(len(tokens)):
            # skip multitoken elements, too hard
            if len(tok2finalforms[i+1]) > 1 or len(tok2finalforms[i+1][0].split(' ')) > 1:
                continue
            # forms starting with "_" are special markers, presumably
            # parser-internal tokens — skipped, confirm against sxpipe output
            if tok2finalforms[i+1][0][0] == "_":
                continue
            t = tokens[i].lower()
            f = tok2finalforms[i+1][0].lower()
            if t != f:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        sentence.spellingcorrections=sxpipeSpellingChanges
        # Dead code kept for reference: earlier, fancier spelling-change
        # detection that also handled amalgams/doubled tokens.
        ''' t = joinTokens([tokens[i].lower()]) f = joinTokens(tok2finalforms[i+1]).lower() print t, f if i > 0: # tok2finalforms first token is no. 1 if tok2finalforms[i+1] == tok2finalforms[i]: continue # don't look if two same wordforms in a row if t != f: # check its not a multiwoprint "spelling?", t, f, tok2finalforms[i+1] sxpipeSpellingChanges += 1rd thing tmpi = i+1 isDouble = False while f.startswith(t) and tmpi < len(tokens) and len(t) < len(f): t = joinTokens([t, tokens[tmpi]]) if t == f: isDouble = True break tmpi += 1 if not isDouble: print "spelling?", t, f, tok2finalforms[i+1] sxpipeSpellingChanges += 1 '''
        ''' for s_token, observedtoken in zip([sentence.tokens[0]] + [sentence.tokens[i] for i in range(1,len(sentence.tokens)) if sentence.tokens[i-1].parseposition != \ sentence.tokens[i].parseposition], getTokensFromFile(processedTokenFile)): # the tokens file overrules the depxml, e.g. in depxml you have \? if s_token.observedform != observedtoken: print "combining:", s_token.frmgform, s_token.observedform, observedtoken s_token.observedform = observedtoken '''
        #print words
        #print forms
        # we remove double entries from amalgams
        if debug:
            print "pre make regex:"
        ''' print [(sentence.tokens[i].parseposition[0], sentence.tokens[i-1].parseposition[-1], sentence.tokens[i].observedform) for i in range(1, len(sentence.tokens))] print [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform for i in range(1,len(sentence.tokens)) if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]] '''
        ''' obstokens = [] obstokenpositions = [] for token in sentence.tokens: for obstoken, obstokenposition in zip(token.observedform, token.parseposition): if obstokenposition not in obstokenpositions: obstokens.append(obstoken) obstokenpositions.append(obstokenposition) '''
        # de-duplicated observed tokens become the sentence's match regex
        obstokens = sentence.setAndGetUniqueTokens()
        #print "obstokens", obstokens
        sentence.matchregex = makeRegexFromTokens(obstokens)
        if debug:
            print "obstokens:", obstokens
        ''' sentence.matchregex = makeRegexFromTokens( [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform for i in range(1,len(sentence.tokens)) if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]]) '''
    return sentence