# Double-space sentence files: for every file in `sentfolder` (one sentence
# per line), insert a blank line after each sentence and write the result to
# `sentparafolder` under the same filename.
sentfolder = '/Lab/Projects/sentence/sents'
sentparafolder = '/Lab/Projects/sentence/sentparas'

import os
import pytxt  # project-local writer; toprint=True presumably echoes to stdout -- TODO confirm

for sentfn in os.listdir(sentfolder):
    sentfnfn = os.path.join(sentfolder, sentfn)
    # fix: close the handle deterministically even if read() raises
    with open(sentfnfn) as f:
        t = f.read()
    # Each sentence keeps its own trailing '\n'; joining on '\n' leaves one
    # blank line between consecutive sentences (identical output to the
    # original append-then-join loop).
    paragraphed = '\n'.join(sentence + '\n' for sentence in t.split('\n'))
    ofn = os.path.join(sentparafolder, sentfn)
    pytxt.write(ofn, paragraphed, toprint=True)
# --- script fragment: cadence plotting and stats dump ---
# NOTE(review): the unconditional exit() below aborts the script immediately,
# so everything that follows in this fragment is dead code, kept verbatim.
exit()

import rpyd2

# `stats` is defined elsewhere in the file (not visible here); it appears to
# map a syllable length i to a list of row-dicts -- TODO confirm upstream.
for i in stats:
    r=rpyd2.RpyD2([d for d in stats[i]])
    # One flipped box+point plot per syllable length, written to a
    # zero-padded 'cadences-length-NNNN.' filename.
    r.plot(fn='cadences-length-'+str(i).zfill(4)+'.', x='pattern', y='oe', title='Cadences of '+str(i)+' syllables in length, measured for their observed/expected appearance at ends of sentences', boxplot=True,group='pattern',col='pattern',point=True,smooth=False,flip=True)

# Second unconditional exit(): the stats-dumping code below is doubly dead.
exit()

# Dump (pattern, count) rows sorted by descending count.
o=[]
for p,count in sorted(profile_stats.items(),key=lambda lx: -lx[1]):
    #print p,"\t",count
    o+=[ [str(p),str(count)] ]

import pytxt
pytxt.write('cadence-stats.txt',o,toprint=True)

# Per syllable-length i (keys of `profile_eg`, defined upstream), write the
# most frequent examples, capped at `lines` rows, one file per length.
for i in profile_eg:
    o=[]
    lines=1000
    line=0
    for p,count in sorted(profile_eg[i].items(),key=lambda lx: -lx[1]):
        #print p,"\t",count
        line+=1
        o+=[ [str(p),str(count)] ]
        if line>lines:
            break
    pytxt.write('cadence-egs.'+str(i).zfill(2)+'.txt',o,toprint=True)
# --- fragment: continuation of a loop whose header lies outside this view ---
# `embedlevel`, `embedlimit`, `x`, `pstr`, `sentstr`, `pdict` are all defined
# upstream; `break`/`continue` target the enclosing (unseen) loop.
# NOTE(review): nesting below is reconstructed from collapsed source -- confirm.
if embedlevel < 0 or embedlevel > embedlimit:
    break
else:
    print x
# Skip tokens that are entirely upper-case letters (look like tags rather
# than words -- TODO confirm intent).
if x.isalpha() and x == x.upper():
    continue
pstr += x
# NOTE(review): unconditional exit() -- the remainder of this fragment is dead code.
exit()
# NOTE(review): both replace() arguments render as a single space here;
# likely meant to collapse doubled spaces -- confirm against the original file.
pstr = pstr.replace(' ', ' ').strip()
if not pstr:
    continue
# Require at least one internal space, i.e. a multi-word phrase.
if not ' ' in pstr:
    continue
# try:
# pdict[pstr]+=1
# except:
# pdict[pstr]=1
try:
    print pstr, sentstr.index(pstr)
except:
    # Phrase not found in the sentence string: dump loud diagnostics.
    print "!!" * 10
    print sentstr
    print pstr
    print "!!" * 10
exit()
# Emit `pdict` as TSV text, most frequent first.
o = ''
for k, v in sorted(pdict.items(), key=lambda x: -x[1]):
    o += str(k) + '\t' + str(v) + '\n'
import pytxt
pytxt.write('np-stats.txt', o, toprint=True)
# --- truncated fragment: these are the trailing keyword arguments of an
# r.plot(...) call whose opening (`r.plot(fn=...`) lies outside this view;
# an identical complete call appears in the unformatted copy of this code
# earlier in the file ---
     x='pattern', y='oe',
     title='Cadences of ' + str(i) + ' syllables in length, measured for their observed/expected appearance at ends of sentences',
     boxplot=True, group='pattern', col='pattern', point=True,
     smooth=False, flip=True)
# NOTE(review): unconditional exit() -- everything below is dead code.
exit()
# Dump (pattern, count) rows sorted by descending count.
o = []
for p, count in sorted(profile_stats.items(), key=lambda lx: -lx[1]):
    #print p,"\t",count
    o += [[str(p), str(count)]]
import pytxt
pytxt.write('cadence-stats.txt', o, toprint=True)
# Per syllable-length i (keys of `profile_eg`, defined upstream), write the
# most frequent examples, capped at `lines` rows, one file per length.
for i in profile_eg:
    o = []
    lines = 1000
    line = 0
    for p, count in sorted(profile_eg[i].items(), key=lambda lx: -lx[1]):
        #print p,"\t",count
        line += 1
        o += [[str(p), str(count)]]
        if line > lines:
            break
    pytxt.write('cadence-egs.' + str(i).zfill(2) + '.txt', o, toprint=True)
def parse2lines(fn): #ifn=sys.argv[1] ifn='/Lab/Projects/sentence/parsed/middlemarch.txt.xml' ofn=os.path.basename(ifn) f=open(ifn) t=str(f.read()) f.close() sents=t.split('<sentence ') ldlim=100 for nn in range(30,31): ld=[] dl={} df=None o=[] sentnum=0 random.shuffle(sents) print nn, "?" for sentence in sents[1:]: tokens=[] for token in sentence.split('<word>')[1:]: token=token.split('</word>')[0] tokens+=[token] if len(tokens)!=nn: continue try: x=[unicode(bb) for bb in tokens] except UnicodeDecodeError: continue sentnum+=1 if ldlim and sentnum>ldlim: break parse=sentence.split('<parse>')[1].split('</parse>')[0] pdat=parse.split() wordi=0 y=4 o+=[['sent'+str(sentnum).zfill(3)," ".join(tokens)]] for pnum in range(len(pdat)): p=pdat[pnum] try: w=tokens[wordi] except IndexError: continue pnop=p.replace('(','').replace(')','') if pnop==w: wordi+=1 if wordi>=len(tokens): break d={} d['wordnum']=wordi d['depth']=y d['sentnum']=str(sentnum).zfill(3) try: dl['sent'+str(sentnum).zfill(3)]+=[y] except KeyError: dl['sent'+str(sentnum).zfill(3)]=[] dl['sent'+str(sentnum).zfill(3)]+=[y] ld.append(d) y+=p.count(')') y-=p.count('(') if not ld: continue if ldlim and sentnum<ldlim: continue r1=rpyd2.RpyD2(dl) r2=rpyd2.RpyD2(ld) pytxt.write('sentkey.'+os.path.basename(ifn)+'.'+str(nn).zfill(3)+'.txt',o,toprint=True) #r2.plot(x='wordnum',y='depth',col='sentnum',group='sentnum',line=True,point=False) #r1.corrgram() r1.kclust(cor=True) r1.hclust(cor=True)
# --- fragment: duplicate of the np-stats fragment earlier in this file,
# minus its leading embed-level guard; `x`, `pstr`, `sentstr`, `pdict` are
# defined upstream and `continue` targets the enclosing (unseen) loop.
# NOTE(review): nesting below is reconstructed from collapsed source -- confirm.
print x
# Skip tokens that are entirely upper-case letters (look like tags rather
# than words -- TODO confirm intent).
if x.isalpha() and x==x.upper():
    continue
pstr+=x
# NOTE(review): unconditional exit() -- the remainder of this fragment is dead code.
exit()
# NOTE(review): both replace() arguments render as a single space here;
# likely meant to collapse doubled spaces -- confirm against the original file.
pstr=pstr.replace(' ',' ').strip()
if not pstr:
    continue
# Require at least one internal space, i.e. a multi-word phrase.
if not ' ' in pstr:
    continue
# try:
# pdict[pstr]+=1
# except:
# pdict[pstr]=1
try:
    print pstr,sentstr.index(pstr)
except:
    # Phrase not found in the sentence string: dump loud diagnostics.
    print "!!"*10
    print sentstr
    print pstr
    print "!!"*10
exit()
# Emit `pdict` as TSV text, most frequent first.
o=''
for k,v in sorted(pdict.items(),key=lambda x: -x[1]):
    o+=str(k)+'\t'+str(v)+'\n'
import pytxt
pytxt.write('np-stats.txt',o,toprint=True)
def parse2lines(fn):
    """Cluster parse-tree depth profiles of fixed-length sentences.

    Reads a CoreNLP-style XML parse file, collects sentences of exactly nn
    tokens (nn in range(30, 31)), records the parse-tree depth at each parse
    token, writes a sentence-key file via pytxt, and clusters the
    per-sentence depth curves with rpyd2.

    NOTE(review): duplicate of the parse2lines defined earlier in this file;
    whichever definition appears later in the module wins. The `fn`
    parameter is accepted but never used -- the input path is hard-coded
    below. Statement nesting was reconstructed from collapsed source;
    confirm the placement of the early `break` and the KeyError fallback.
    """
    #ifn=sys.argv[1]
    ifn = '/Lab/Projects/sentence/parsed/middlemarch.txt.xml'
    ofn = os.path.basename(ifn)  # unused below
    f = open(ifn)
    t = str(f.read())
    f.close()
    # Crude XML split: element 0 is the pre-<sentence> header, skipped below.
    sents = t.split('<sentence ')
    ldlim = 100  # cap on sentences sampled per length
    for nn in range(30, 31):  # sentence lengths (in tokens) to examine
        ld = []      # per-parse-token rows: {wordnum, depth, sentnum}
        dl = {}      # sentence id -> list of depths (one per parse token)
        df = None    # unused
        o = []       # (sentence id, sentence text) rows for the key file
        sentnum = 0
        random.shuffle(sents)
        print nn, "?"
        for sentence in sents[1:]:
            # Scrape every <word>...</word> token in document order.
            tokens = []
            for token in sentence.split('<word>')[1:]:
                token = token.split('</word>')[0]
                tokens += [token]
            if len(tokens) != nn:
                continue
            # Skip sentences whose tokens cannot be decoded (Python 2).
            try:
                x = [unicode(bb) for bb in tokens]
            except UnicodeDecodeError:
                continue
            sentnum += 1
            if ldlim and sentnum > ldlim:
                break
            parse = sentence.split('<parse>')[1].split('</parse>')[0]
            pdat = parse.split()
            wordi = 0
            y = 4  # starting depth offset -- TODO confirm why 4
            o += [['sent' + str(sentnum).zfill(3), " ".join(tokens)]]
            for pnum in range(len(pdat)):
                p = pdat[pnum]
                try:
                    w = tokens[wordi]
                except IndexError:
                    continue
                # A parse token stripped of parentheses that equals the next
                # word advances the word cursor.
                pnop = p.replace('(', '').replace(')', '')
                if pnop == w:
                    wordi += 1
                    if wordi >= len(tokens):
                        break
                d = {}
                d['wordnum'] = wordi
                d['depth'] = y
                d['sentnum'] = str(sentnum).zfill(3)
                try:
                    dl['sent' + str(sentnum).zfill(3)] += [y]
                except KeyError:
                    dl['sent' + str(sentnum).zfill(3)] = []
                    dl['sent' + str(sentnum).zfill(3)] += [y]
                ld.append(d)
                # Depth bookkeeping: closers raise y, openers lower it.
                y += p.count(')')
                y -= p.count('(')
        if not ld:
            continue
        # Only proceed when the full quota of sentences was collected.
        if ldlim and sentnum < ldlim:
            continue
        r1 = rpyd2.RpyD2(dl)
        r2 = rpyd2.RpyD2(ld)
        pytxt.write('sentkey.' + os.path.basename(ifn) + '.' + str(nn).zfill(3) + '.txt', o, toprint=True)
        #r2.plot(x='wordnum',y='depth',col='sentnum',group='sentnum',line=True,point=False)
        #r1.corrgram()
        r1.kclust(cor=True)
        r1.hclust(cor=True)