Example #1

import calcPLex
from compareCorrectedCorpus import getFinalTokenFormsAndTreesAndWeight
# lemmacat2freqrank (a lemma_cat -> frequency-rank mapping) is assumed to be
# built elsewhere before this snippet runs.
# testxmls = ["analysed_SpellChecker/entry_81/0/0/0/entry_81.E1.dep.xml"

testxmls = [
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E1.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E5.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E2.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E6.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E3.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E7.dep.xml",
    "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellChecker/entry_350/0/0/0/entry_350.E4.dep.xml",
    ]

wordforms = []
for testxml in testxmls:
    print "loading xml", testxml
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml)
    # each tok2lemmacats value is a list of (lemma, cat) 2-tuples
    # print tok2lemmacats.values()
    for lemmacats in tok2lemmacats.values():
        wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])

print wordforms
print "tokens", len(wordforms)
print "types", len(set(wordforms))
popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank)

print "popt", popt
print "pcov", pcov

print "S"
popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank)
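
# Aside: the popt/pcov pair printed above is the classic return shape of
# scipy.optimize.curve_fit, which calcPLex presumably uses internally (an
# assumption; calcPLex's source isn't shown here). A minimal, self-contained
# sketch of that pattern with a hypothetical model and data:
import numpy as np
from scipy.optimize import curve_fit

def decay(x, a, b):
    # hypothetical PLex-style model: an exponential decay over rank bands
    return a * np.exp(-b * x)

xdata = np.arange(1, 11, dtype=float)
ydata = decay(xdata, 0.5, 0.3) + 0.01 * np.random.randn(10)
popt_demo, pcov_demo = curve_fit(decay, xdata, ydata)
print "demo popt", popt_demo  # fitted (a, b)
print "demo pcov", pcov_demo  # covariance matrix of the fit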
Example #2

import compareCorrectedCorpus

__author__ = 'nparslow'

for fnum in range(1, 18):
    xmlfile = "/home/nparslow/Documents/AutoCorrige/Corpora/analysed_CORPUS_CEFLE/Billy4/0/0/0/Billy4.E" + str(fnum) + ".dep.xml"

    finaltokens, origlemmacats, origverb2info, origtrees, origweight, wordsbeforemainverb =\
                compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(xmlfile)

    print fnum, wordsbeforemainverb
Example #3

import os
import re
import xml.etree.ElementTree as ET

import classifyVerbs
import compareCorrectedCorpus
# Sentence, Token, fixMixedEncodings, getTokensFromFile, joinTokens and
# makeRegexFromTokens are assumed to be defined elsewhere in this repo.

def getNextSentenceFromFiles(processedDepXMLFile, processedTokenFile, processedLogFile, debug=False):

    #print processedSentenceFile
    sentence = None
    if os.path.isfile(processedDepXMLFile):
        sentence = Sentence()

        tree = ET.parse(processedDepXMLFile)
        # 'W' nodes are 'words' which can include multiple tokens, e.g. 'bien que' is one word
        # .iter for recursive, .findall for depth of 1
        # look up the node's cluster, then read the cluster's 'lex' attribute (processed further below)
        wordsforms = [(x.attrib['lemma'], x.attrib['form'], x.attrib['cluster'],
                       fixMixedEncodings(tree.findall("cluster[@id='" + x.attrib['cluster'] + "']")[0].attrib["lex"]))
                      for x in tree.iter('node') if x.get('lemma') and x.get('lemma') != "_EPSILON"]
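        # each wordsforms entry is (lemma, form, cluster id, lex string); the
        # cluster id appears to carry the token start offset at split('_')[1],
        # and lex carries "E<n>F<pos>|<token>" segments (both inferred from
        # how they are used below, not from a format spec).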
        print "wordsforms"
        print wordsforms
        # correct the encodings and remove epsilons

        tokens = getTokensFromFile(processedTokenFile)
        if len(wordsforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            sentence.tokens = [Token(x[0], x[1], re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(wordsforms, key=lambda x: int(x[2].split('_')[1]))]
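            # illustration of the two extractions above on an assumed lex
            # segment (format inferred from the regexes, not from the corpus):
            #   re.findall(ur'(?<=[F\d]\d\|)[^ ]+', u"E1F2|chats")  ->  [u'chats']
            #   re.findall(ur'(?<=F)\d+(?=\|)',     u"E1F2|chats")  ->  [u'2']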
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            print "using tokens***"
            sentence.tokens = [Token(None, None, tokens[i], i) for i in range(len(tokens))]

        #print sentence.tokens
        #print "obs forms:", [x.observedform for x in sentence.tokens]

        tok2finalforms, tok2lemmacats, verb2info, trees, (weight, minweight) = \
            compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(processedDepXMLFile)
        sentence.setLemmaCats(tok2lemmacats)
        sentence.weightperword = weight
        sentence.minweight = minweight

        verbAnalysis = classifyVerbs.classifyVerbs(verb2info)
        # verb counts (default to 0 when a key is absent):
        sentence.vsingle = verbAnalysis.get("single", 0)
        sentence.vaux = verbAnalysis.get("aux", 0)
        sentence.vcompound = verbAnalysis.get("compound", 0)
        sentence.vindicative = verbAnalysis.get("indicative", 0)
        sentence.vconditional = verbAnalysis.get("conditional", 0)
        sentence.vsubjunctive = verbAnalysis.get("subjonctive", 0)  # note: the upstream key is spelt "subjonctive"
        sentence.vimperfect = verbAnalysis.get("imperfect", 0)
        sentence.vfuture = verbAnalysis.get("future", 0)
        sentence.vpresent = verbAnalysis.get("present", 0)
        sentence.vnotense = verbAnalysis.get("notense", 0)
        # clause info:
        sentence.crel = verbAnalysis.get("rel", 0)
        sentence.cnom = verbAnalysis.get("nom", 0)
        sentence.cacc = verbAnalysis.get("acc", 0)
        sentence.cloc = verbAnalysis.get("loc", 0)

        sxpipeSpellingChanges = 0
        for i in range(len(tokens)):
            # skip multitoken elements, too hard
            if len(tok2finalforms[i+1]) > 1 or len(tok2finalforms[i+1][0].split(' ')) > 1: continue
            if tok2finalforms[i+1][0][0] == "_": continue
            t = tokens[i].lower()
            f = tok2finalforms[i+1][0].lower()

            if t != f:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        sentence.spellingcorrections = sxpipeSpellingChanges
        '''
            t = joinTokens([tokens[i].lower()])
            f = joinTokens(tok2finalforms[i+1]).lower()

            print t, f
            if i > 0:
                # tok2finalforms first token is no. 1
                if tok2finalforms[i+1] == tok2finalforms[i]: continue # don't look if two same wordforms in a row
            if t != f:
                # check it's not a multiword thing
                tmpi = i+1
                isDouble = False
                while f.startswith(t) and tmpi < len(tokens) and len(t) < len(f):
                    t = joinTokens([t, tokens[tmpi]])
                    if t == f:
                        isDouble = True
                        break
                    tmpi += 1

                if not isDouble:
                    print "spelling?", t, f, tok2finalforms[i+1]
                    sxpipeSpellingChanges += 1
        '''


        '''
        for s_token, observedtoken in zip([sentence.tokens[0]] + [sentence.tokens[i] for i in range(1,len(sentence.tokens))
                                                                  if sentence.tokens[i-1].parseposition != \
                                                                     sentence.tokens[i].parseposition],
                                          getTokensFromFile(processedTokenFile)):
            # the tokens file overrules the depxml, e.g. in depxml you have \?
            if s_token.observedform != observedtoken:
                print "combining:", s_token.frmgform, s_token.observedform, observedtoken
                s_token.observedform = observedtoken
        '''
        #print words
        #print forms

        # we remove double entries from amalgams
        if debug: print "pre make regex:"
        '''
        print [(sentence.tokens[i].parseposition[0],
                sentence.tokens[i-1].parseposition[-1],
                sentence.tokens[i].observedform) for i in range(1, len(sentence.tokens))]
        print [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]]
        '''
        '''
        obstokens = []
        obstokenpositions = []
        for token in sentence.tokens:
            for obstoken, obstokenposition in zip(token.observedform, token.parseposition):
                if obstokenposition not in obstokenpositions:
                    obstokens.append(obstoken)
                    obstokenpositions.append(obstokenposition)
        '''
        obstokens = sentence.setAndGetUniqueTokens()
        #print "obstokens", obstokens
        sentence.matchregex = makeRegexFromTokens(obstokens)
        if debug: print "obstokens:", obstokens
        '''
        sentence.matchregex = makeRegexFromTokens(
            [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]])
        '''
    return sentence
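
# makeRegexFromTokens is defined elsewhere in this repo; a minimal sketch of
# what such a helper plausibly does (hypothetical, for illustration only):
def makeRegexFromTokens_sketch(tokens):
    # escape regex metacharacters in each token and allow any whitespace run
    # between consecutive tokens when matching against the raw sentence text
    return re.compile(ur'\s*'.join(re.escape(t) for t in tokens), re.UNICODE)

# Illustrative call (file names are hypothetical, following the corpus layout
# used in the examples above):
# sentence = getNextSentenceFromFiles("entry_350.E1.dep.xml", "entry_350.E1.tok", "entry_350.E1.log")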