def fileStats(self, path, fileids):
    """Count, file by file, the matches of every base GQ surface pattern.

    For each file id in ``fileids`` the file ``path + '/' + idf`` is read
    chunk by chunk, sentences are rebuilt with ``MySen`` and every pattern
    class is applied to each completed sentence.  The per-file classes are
    stored in ``self.stats[idf]`` and each one is appended to the aggregate
    in ``self.classstats`` that carries the same tag.  Finally
    ``self.classAvg2`` is run over the aggregates.

    Fixes with respect to the previous revision:
      * the class built from ``lessk1`` was tagged with ``lessk2``, so the
        ``lessk1`` aggregate never received any data while the ``lessk2``
        aggregate received two classes per file;
      * the ``all5`` class was applied with the ``patts`` of the ``all6``
        class.

    Arguments:
      path    -- directory holding the corpus files
      fileids -- iterable of file names, relative to ``path``

    Returns None; results are left in ``self.classstats`` and ``self.stats``.
    """
    tit = "Base GQs (patterns)"
    hashes = "###################################################"
    dashes = "==================================================="

    # Pattern lists in the order in which the aggregates are reported.
    # The first entry of every pattern list (stripped) is used as its tag.
    report_order = [
        # all
        all1, all2, all3, all4, all5, all6, all7, all8,
        # some
        some1, some2, some3, some4, some5, some6,
        # > k
        morek1, morek2, morek3,
        # < k
        lessk1, lessk2, lessk3,
        # k
        exactlyk1, exactlyk2, exactlyk3, exactlyk4,
        # most
        most1, most2, most3, most4, most5,
        # few
        few1, few2, few3, few4,
        # > p/k
        morethanpro1, morethanpro2, morethanpro3, morethanpro4,
        # < p/k
        lessthanpro1, lessthanpro2, lessthanpro3, lessthanpro4,
        # p/k
        pro1, pro2, pro3, pro4, pro5,
        # > p%
        morekper1, morekper2, morekper3, morekper4,
        # < p%
        lesskper1, lesskper2, lesskper3, lesskper4, lesskper5,
        # p%
        kper1, kper2,
    ]

    # Pattern lists in the order in which they are built and applied to
    # each sentence; this is also the order kept in self.stats[idf].
    match_order = [
        # all
        all1, all2, all3, all4, all5, all6, all7, all8,
        # some
        some1, some2, some3, some4, some5, some6,
        # > k
        morek1, morek2, morek3,
        # < k
        lessk1, lessk2, lessk3,
        # k
        exactlyk1, exactlyk2, exactlyk3, exactlyk4,
        # most
        most1, most2, most3, most4, most5,
        # few
        few1, few2, few3, few4,
        # > k/100
        morekper1, morekper2, morekper3, morekper4,
        # < k/100
        lesskper1, lesskper2, lesskper3, lesskper4, lesskper5,
        # k/100
        kper1, kper2,
        # > p/k
        morethanpro1, morethanpro2, morethanpro3, morethanpro4,
        # < p/k
        lessthanpro1, lessthanpro2, lessthanpro3, lessthanpro4,
        # p/k
        pro1, pro2, pro3, pro4, pro5,
    ]

    self.classstats = [
        MyClassStats2(patt[0].strip(), [], 0, tit) for patt in report_order
    ]

    print(hashes)
    print("GQ STATS")
    print(hashes)

    # computing the stats
    for idf in fileids:
        mydata = OpenFile(path + '/' + idf)
        mydata.lines = mydata.myread()

        print(idf)
        print(dashes)

        # no patterns are excluded in this variant
        rest = []

        # the corpus class matches anything, so it counts the sentences
        corpus = MyClass2([".*"], [], idf, 0, 0, "corpus")

        # one class per pattern list, tagged with its own first pattern
        filestats = [
            MyClass2(MyPatts2(patt).P, MyPatts2(rest).P, idf, 0, 0,
                     patt[0].strip())
            for patt in match_order
        ]

        # examine only one chunk of the big file at a time;
        # myread() is expected to return something falsy once exhausted
        while mydata.lines:
            lines = mydata.lines
            my_max = len(lines)
            i = 0
            while i < my_max:
                # try to build a sentence starting at line i
                sen = MySen()
                sen.buildSen(i, lines, my_max)
                # comparison kept as in the original: the type of
                # sen.end is not visible from here
                if sen.end == True:
                    # retrieve the POS tagged sentence, apply the patterns
                    myline = sen.sen
                    corpus.openSen(myline, corpus.pats, corpus.patts)
                    for cls in filestats:
                        cls.openSen(myline, cls.pats, cls.patts)
                # NOTE(review): indentation was lost in the source; the
                # advance is assumed to sit at loop level, otherwise an
                # unfinished sentence would stall the loop -- confirm.
                # Skip the lines covered by the sentence, if any.
                if sen.len > 0:
                    i = i + sen.len
                else:
                    i = i + 1
            # move to the next chunk
            mydata.lines = mydata.myread()

        # total cumulative count; the extra 1 is kept from the original
        # (presumably a guard against a zero total -- TODO confirm)
        tot = sum(cls.count for cls in filestats) + 1

        print("corpus size : " + repr(corpus.count) + " sentences")
        print(dashes)
        print("total matches: " + repr(tot) + " GQs")

        self.stats[idf] = filestats

        # hand every per-file class to the aggregate with the same tag
        for cla in self.classstats:
            for thiscls in filestats:
                if thiscls.tag == cla.tag:
                    cla.classes.append(thiscls)

    # updating the distribution
    # NOTE(review): assumed to run once, after all files were processed
    # (indentation was lost in the source) -- confirm.
    self.classAvg2(self.classstats)

    print(hashes)
def fileStats(self, path, fileids):
    """Count, file by file, the matches of each base GQ (one class each).

    For each file id in ``fileids`` the file ``path + '/' + idf`` is read
    chunk by chunk, sentences are rebuilt with ``MySen`` and the thirteen
    quantifier classes are applied to each completed sentence.  Every class
    gets its relative frequency in ``freq``, the classes are stored in
    ``self.stats[idf]`` and appended to the aggregate in
    ``self.classstats`` with the same tag.  The aggregates are then
    averaged, sorted and printed.

    Fix with respect to the previous revision: ``freq`` was computed as
    ``count / tot``, which is an integer division on Python 2 and hence
    always rounded to 0; the division is now done in floating point.

    Arguments:
      path    -- directory holding the corpus files
      fileids -- iterable of file names, relative to ``path``

    Returns None; results are left in ``self.classstats`` and ``self.stats``.
    """
    tit = "Base GQs"
    hashes = "###################################################"
    dashes = "==================================================="

    # aggregate classes, in reporting order
    report_tags = [
        "all", "some", ">k", "<k", "k", "most", "few",
        ">p/k", "<p/k", "p/k", ">k/100", "<k/100", "k/100",
    ]
    self.classstats = [MyClassStats2(tag, [], 0, tit) for tag in report_tags]

    print(hashes)
    print("GQ STATS")
    print(hashes)

    # computing the stats
    for idf in fileids:
        mydata = OpenFile(path + '/' + idf)
        mydata.lines = mydata.myread()

        print(idf)
        print(dashes)

        # no patterns are excluded in this variant
        rest = []

        # the corpus class matches anything, so it counts the sentences
        corpus = MyClass2([".*"], [], idf, 0, 0, "corpus")

        # (pattern list, tag) in the order the classes are built/applied.
        # NOTE(review): ``all`` is presumably a module-level pattern list
        # shadowing the builtin -- confirm.
        match_order = [
            (some, "some"),
            (all, "all"),
            (morek, ">k"),
            (lessk, "<k"),
            (exactlyk, "k"),
            (most, "most"),
            (few, "few"),
            (morekper, ">k/100"),
            (lesskper, "<k/100"),
            (kper, "k/100"),
            (morethanpro, ">p/k"),
            (lessthanpro, "<p/k"),
            (pro, "p/k"),
        ]
        filestats = [
            MyClass2(MyPatts2(patt).P, MyPatts2(rest).P, idf, 0, 0, tag)
            for patt, tag in match_order
        ]

        # examine only one chunk of the big file at a time;
        # myread() is expected to return something falsy once exhausted
        while mydata.lines:
            lines = mydata.lines
            my_max = len(lines)
            i = 0
            while i < my_max:
                # try to build a sentence starting at line i
                sen = MySen()
                sen.buildSen(i, lines, my_max)
                # comparison kept as in the original: the type of
                # sen.end is not visible from here
                if sen.end == True:
                    # retrieve the POS tagged sentence, apply the patterns
                    myline = sen.sen
                    corpus.openSen(myline, corpus.pats, corpus.patts)
                    for cls in filestats:
                        cls.openSen(myline, cls.pats, cls.patts)
                # NOTE(review): indentation was lost in the source; the
                # advance is assumed to sit at loop level, otherwise an
                # unfinished sentence would stall the loop -- confirm.
                # Skip the lines covered by the sentence, if any.
                if sen.len > 0:
                    i = i + sen.len
                else:
                    i = i + 1
            # move to the next chunk
            mydata.lines = mydata.myread()

        # total cumulative count; the extra 1 keeps the divisor below
        # from ever being zero
        tot = sum(cls.count for cls in filestats) + 1

        print("corpus size : " + repr(corpus.count) + " sentences")
        print(dashes)
        print("total matches: " + repr(tot) + " GQs")

        # relative frequencies; float() forces a true division
        for cls in filestats:
            cls.freq = round(float(cls.count) / tot, 2)

        self.stats[idf] = filestats

        # hand every per-file class to the aggregate with the same tag
        for cla in self.classstats:
            for thiscls in filestats:
                if thiscls.tag == cla.tag:
                    cla.classes.append(thiscls)

    # updating the distribution
    # NOTE(review): assumed to run once, after all files were processed
    # (indentation was lost in the source) -- confirm.
    self.classAvg(self.classstats)
    self.classAvg2(self.classstats)
    self.classstats = self.sortClass(self.classstats)

    print(hashes)

    self.printClasses(self.classstats)
def fileStats(self, path, fileids):
    """Count, file by file, the matches of the disjoint base GQ patterns.

    Same procedure as the plain per-pattern statistics, except that every
    class is built with a second pattern list (cardinals, ``nomost``,
    ``nofew`` ...) passed as the ``N`` argument of ``MyClass2`` and that
    sentences are matched with ``openSen2``.  The per-file classes are
    stored in ``self.stats[idf]`` and appended to the aggregate in
    ``self.classstats`` with the same tag; ``self.classAvg2`` is run at
    the end.

    Fix with respect to the previous revision: the class built from
    ``lessk1`` was tagged with ``lessk2``, so the ``lessk1`` aggregate
    never received any data while the ``lessk2`` aggregate received two
    classes per file.

    Arguments:
      path    -- directory holding the corpus files
      fileids -- iterable of file names, relative to ``path``

    Returns None; results are left in ``self.classstats`` and ``self.stats``.
    """
    tit = "Base GQs (disjoint patterns)"
    hashes = "###################################################"
    dashes = "==================================================="

    # Pattern lists in the order in which the aggregates are reported.
    # The first entry of every pattern list (stripped) is used as its tag.
    report_order = [
        # all
        all1, all2, all3, all7, all8,
        # some
        some5,
        # > k
        morek1, morek2, morek3,
        # < k
        lessk1, lessk2, lessk3, lessk4, lessk5,
        # most
        most1, most2, most3, most4, most5,
        # few
        few1, few2, few3, few4,
        # > p/k
        morethanpro1, morethanpro2, morethanpro3, morethanpro4,
        morethanpro5, morethanpro6,
        # < p/k
        lessthanpro1, lessthanpro2, lessthanpro3, lessthanpro4,
        lessthanpro5, lessthanpro6, lessthanpro7,
    ]
    self.classstats = [
        MyClassStats2(patt[0].strip(), [], 0, tit) for patt in report_order
    ]

    print(hashes)
    print("GQ STATS")
    print(hashes)

    # computing the stats
    for idf in fileids:
        mydata = OpenFile(path + '/' + idf)
        mydata.lines = mydata.myread()

        print(idf)
        print(dashes)

        # second ("N") pattern lists -- presumably patterns that must
        # NOT match; TODO confirm against MyClass2
        rest = []
        digit = [" @card@/cd "]
        # built once per file and shared by the five "most" classes
        notmost = (nomost + lessthanpro6 + lessthanpro7 + lessthanpro3
                   + lessk1 + lessk2)

        # the corpus class matches anything, so it counts the sentences
        corpus = MyClass2([".*"], [], idf, 0, 0, "corpus")

        # (P pattern list, N pattern list) in build/apply order
        match_order = [
            # all
            (all1, rest), (all2, rest), (all3, rest),
            (all7, rest), (all8, rest),
            # some
            (some5, rest),
            # > k
            (morek1, digit), (morek2, digit), (morek3, digit),
            # < k
            (lessk1, digit), (lessk2, digit), (lessk3, digit),
            (lessk4, digit), (lessk5, digit),
            # most
            (most1, notmost), (most2, notmost), (most3, notmost),
            (most4, notmost), (most5, notmost),
            # few
            (few1, nofew), (few2, nofew), (few3, nofew), (few4, nofew),
            # > p/k
            (morethanpro1, digit), (morethanpro2, digit),
            (morethanpro3, digit), (morethanpro4, digit),
            (morethanpro5, digit), (morethanpro6, digit),
            # < p/k
            (lessthanpro1, digit), (lessthanpro2, digit),
            (lessthanpro3, digit), (lessthanpro4, digit),
            (lessthanpro5, digit), (lessthanpro6, digit),
            (lessthanpro7, digit),
        ]
        filestats = [
            MyClass2(MyPatts2(pos).P, MyPatts2(neg).P, idf, 0, 0,
                     pos[0].strip())
            for pos, neg in match_order
        ]

        # examine only one chunk of the big file at a time;
        # myread() is expected to return something falsy once exhausted
        while mydata.lines:
            lines = mydata.lines
            my_max = len(lines)
            i = 0
            while i < my_max:
                # try to build a sentence starting at line i
                sen = MySen()
                sen.buildSen(i, lines, my_max)
                # comparison kept as in the original: the type of
                # sen.end is not visible from here
                if sen.end == True:
                    # retrieve the POS tagged sentence
                    myline = sen.sen
                    # the corpus counter uses openSen, the GQ classes
                    # use openSen2 (as in the original)
                    corpus.openSen(myline, corpus.pats, corpus.patts)
                    for cls in filestats:
                        cls.openSen2(myline, cls.pats, cls.patts)
                # NOTE(review): indentation was lost in the source; the
                # advance is assumed to sit at loop level, otherwise an
                # unfinished sentence would stall the loop -- confirm.
                # Skip the lines covered by the sentence, if any.
                if sen.len > 0:
                    i = i + sen.len
                else:
                    i = i + 1
            # move to the next chunk
            mydata.lines = mydata.myread()

        # total cumulative count; the extra 1 is kept from the original
        # (presumably a guard against a zero total -- TODO confirm)
        tot = sum(cls.count for cls in filestats) + 1

        print("corpus size : " + repr(corpus.count) + " sentences")
        print(dashes)
        print("total matches: " + repr(tot) + " GQs")

        self.stats[idf] = filestats

        # hand every per-file class to the aggregate with the same tag
        for cla in self.classstats:
            for thiscls in filestats:
                if thiscls.tag == cla.tag:
                    cla.classes.append(thiscls)

    # updating the distribution
    # NOTE(review): assumed to run once, after all files were processed
    # (indentation was lost in the source) -- confirm.
    self.classAvg2(self.classstats)

    print(hashes)
def fileStats(self, path, fileids):
    """Count GQ matches per class ("ari", "cnt", "pro") for each file.

    For every file id the file ``path + '/' + idf`` is opened with
    ``OpenFile`` and consumed chunk by chunk (``myread`` is called until it
    returns something falsy).  Inside a chunk, sentences are assembled with
    ``MySen.buildSen``; each finished sentence is handed to ``openSen`` of a
    catch-all ``".*"`` matcher and of the three class matchers built from
    the ``aristotelian``, ``counting`` and ``proportional`` pattern lists.

    Side effects:
        * ``self.classstats`` is replaced by three fresh ``MyClassStats2``
          accumulators, filled, passed through ``classAvg`` / ``classAvg2``
          and finally replaced by the result of ``sortClass``.
        * ``self.stats[idf]`` is set to the list of the three per-file
          matchers, whose ``freq`` holds ``count / tot`` rounded to two
          decimals.
        * Banners, the corpus size and the match total are printed.

    Args:
        path: directory that contains the files.
        fileids: iterable of file names, each joined to ``path`` with '/'.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; the averaging/sorting step is taken to run
    once, after all files have been processed -- confirm.
    """
    banner = "###################################################"
    rule = "==================================================="

    # starting the title
    tit = "GQs by class"
    # stat classes: one accumulator per GQ class, shared by all files
    self.classstats = [
        MyClassStats2("ari", [], 0, tit),
        MyClassStats2("cnt", [], 0, tit),
        MyClassStats2("pro", [], 0, tit),
    ]

    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and also parses under Python 3.
    print(banner)
    print("GQ STATS (by class)")
    print(banner)

    # computing the stats
    for idf in fileids:
        mydata = OpenFile(path + '/' + idf)
        mydata.lines = mydata.myread()

        print(rule)
        print(idf)
        print(rule)

        # second pattern list handed to every class matcher; empty here
        # (presumably the patterns to exclude -- TODO confirm)
        rest = []
        # corpus: matches ".*"; its count is reported as the corpus size
        corpus = MyClass2([".*"], [], idf, 0, 0, "corpus")

        # class 1 / 2 / 3 matchers, built in the original order
        matchers = []
        for tag, patterns in (("ari", aristotelian),
                              ("cnt", counting),
                              ("pro", proportional)):
            pos = MyPatts2(patterns).P
            neg = MyPatts2(rest).P
            matchers.append(MyClass2(pos, neg, idf, 0, 0, tag))

        # examine only one chunk of the big file at a time
        while mydata.lines:
            lines = mydata.lines
            my_max = len(lines)
            i = 0
            # loop over chunk
            while i < my_max:
                # build sentence starting at line i
                sen = MySen()
                sen.buildSen(i, lines, my_max)
                # if sentence built, apply patterns; the explicit
                # comparison is kept because MySen.end's type is not
                # visible from here
                if sen.end == True:  # noqa: E712
                    # retrieve POS tagged sentence
                    myline = sen.sen
                    corpus.openSen(myline, corpus.pats, corpus.patts)
                    for cls in matchers:
                        cls.openSen(myline, cls.pats, cls.patts)
                # if a sentence is found, skip the lines it covers,
                # otherwise move to the next line (this also guarantees
                # the loop always advances)
                if sen.len > 0:
                    i = i + sen.len
                else:
                    i = i + 1
            # move to new chunk
            mydata.lines = mydata.myread()

        # total cum count; the extra 1 keeps the denominator below
        # non-zero when nothing matched (note: it is included in the
        # printed total, as in the original output)
        tot = sum(cls.count for cls in matchers) + 1
        print("corpus size : " + repr(corpus.count) + " sentences")
        print(rule)
        print("total matches: " + repr(tot) + " GQs")

        # relative frequencies: force true division, otherwise Python 2
        # floors int/int and every frequency becomes 0
        for cls in matchers:
            cls.freq = round(float(cls.count) / tot, 2)

        filestats = matchers
        self.stats[idf] = filestats

        # attach each per-file matcher to the accumulator with its tag
        for cla in self.classstats:
            for thiscls in filestats:
                if thiscls.tag == cla.tag:
                    cla.classes.append(thiscls)

    # updating the distribution
    self.classAvg(self.classstats)
    self.classAvg2(self.classstats)
    self.classstats = self.sortClass(self.classstats)
    print(banner)