def adjust_wx(x):
    """Transcode a line from WX to SLP1.

    Headword entries start with a <wx-headword> line and end with a
    </wx-headword> line; such lines are converted to <slp-headword> /
    </slp-headword>.  Any other line is split so xml tags and [Page...]
    references pass through unchanged and only plain text is transcoded.
    """
    m = re.search(r'^<(/?)(.*?)>$', x)
    if m:
        # the whole line is a single xml tag: transcode the tag name itself
        x1 = m.group(1)
        x2 = m.group(2)
        y2 = transcoder.transcoder_processString(x2, 'wx', 'slp1')
        return "<%s%s>" % (x1, y2)
    # presumably, not a headword. Don't transcode xml tags.
    # fix: also capture [Page...] references in the split (consistent with
    # the slp1->wx counterpart adjust_slp1), so page references embedded in
    # running text are never run through the transcoder.
    outarr = []  # slp1
    parts = re.split(r'(<[^>]+>)|(\[Page.*?\])', x)
    for part in parts:
        if not part:
            # re.split with alternated capture groups yields None/'' fillers
            pass
        elif part.startswith('<') and part.endswith('>'):
            outarr.append(part)
        elif part.startswith('[Page') and part.endswith(']'):
            outarr.append(part)
        else:
            # assume text in wx. Convert to slp1. Use specialized wx_slp1.xml
            y = transcoder.transcoder_processString(part, 'wx', 'slp1')
            outarr.append(y)
    return ''.join(outarr)
def r(text):
    """Look up a Devanagari word on the INRIA sanskrit lemmatizer and return
    its analysis as 'verb.attr' strings joined by '|' (or kridanta attributes
    for participles).  Requires network access.
    """
    #text1 = transcoder.transcoder_processString(text.decode('utf-8'),'deva','slp1')
    wordtype = wtd(text)  # probe the lemmatizer for the word's category first
    text = transcoder.transcoder_processString(text,'deva','slp1')
    text = text.strip('.')
    url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + text + '&t=SL&c=' + wordtype
    response = urllib2.urlopen(url)
    #print "webpage downloaded at ",
    #timestamp()
    html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    #print "soup made at ",
    #timestamp()
    # navigate to the span holding the analysis table cell
    interestingdiv = soup.find("div", { "class" : "center" })
    table = interestingdiv.find("table", { "class" : "yellow_cent" })
    span = table.tr.th.find("span", { "class" : "latin12" })
    # second <br>-separated chunk carries the morphological data
    data = unicode(span).split('<br>\n')[1]
    if wordtype not in ["Part", "Piic" ]:
        # '}[' separates the attribute block from the verb link markup
        verbattr_separator = unicode(data).split('}[')
        attributes = verbattr_separator[0]
        verbsoup = BeautifulSoup(verbattr_separator[1], 'html.parser')
        verb = verbsoup.a.text
        verb = re.sub("[0-9_]+", "", verb)  # drop homonym numbers/underscores
        verb = transcoder.transcoder_processString(verb,'roman','slp1')
        data = tosm(attributes)
        m = []
        if len(data) > 1:
            for datum in data:
                m.append(verb + '.' + datum)
            output = '|'.join(m)
        else:
            output = verb + '.' + data[0]
    elif wordtype in ["Part", "Piic" ]:
        # participles are handled by the kridanta attribute parser
        output = kridantaattributes(data)
    return output
def key_transcode(m, fromcode, tocode):
    """Transcode the two {key} groups of an <H1> header match from
    fromcode to tocode, leaving the surrounding markup untouched."""
    prefix1, rawkey1, prefix2, rawkey2, rest = m.group(1, 2, 3, 4, 5)
    newkey1 = transcoder.transcoder_processString(rawkey1, fromcode, tocode)
    newkey2 = transcoder.transcoder_processString(rawkey2, fromcode, tocode)
    return "<H1>%s{%s}%s{%s}%s" % (prefix1, newkey1, prefix2, newkey2, rest)
def transcode(self, tranin, tranout):
    """Return a one-line summary of this record with fullroot and sense
    transcoded from tranin to tranout."""
    root_out = transcoder_processString(self.fullroot, tranin, tranout)
    sense_out = self.sense
    if tranout == 'deva':
        # double quotes carry no meaning in Devanagari output
        sense_out = sense_out.replace('"', '')
    sense_out = transcoder_processString(sense_out, tranin, tranout)
    return 'fullroot=%s, sense=%s, L=%s, mwcode=%s' % (
        root_out, sense_out, self.L, self.code)
def alterations(filein,fileout):
    """Read filein (velthuis encoding), apply preprocessing changes, then
    transcode vel -> slp1 -> Devanagari and write the result to fileout.
    Appends a filename marker to log.txt (the per-word logging itself is
    currently disabled, see the commented block below).
    """
    fin = codecs.open(filein,'r','utf-8')
    data = fin.read()
    fin.close()
    data = data.strip()
    print 'making preprocess changes'
    data = changelist(data)
    print "Debugging and writing to log.txt"
    log = codecs.open('log.txt','a','utf-8')
    log.write('#'+filein+"#\n")
    words = data.split(' ')
    counter=1
    out = []
    for i in xrange(len(words)):
        word = words[i]
        word = snchanges(word)
        # Creating log for श ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/1
        """
        if re.search(r'\s["][sn]',word):
            changed = snchanges(word)
            #log.write(str(counter)+":"+word+"\n")
            counter = counter+1
            if not changed == word:
                out.append(changed)
        else:
            out.append(word)
        # Creating log for ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/2
        if re.search(r'"n[^aAiIuUfFxXeEoOykglnm]',word):
            out.append(word)
            rep = word.replace('\n',' ')
            log.write(str(counter)+":"+rep+"\n")
            counter = counter+1
        else:
            out.append(word)
        """
        out.append(word)
    data = ' '.join(out)
    log.close()
    print 'changing to slp1'
    output = transcoder.transcoder_processString(data,'vel','slp1')
    #fout1 = codecs.open(fileout,'w','utf-8')
    #fout1.write(output)
    #fout1.close()
    output = slpchanges(output)
    print 'changing to Devanagari'
    output = transcoder.transcoder_processString(output,'slp1','deva')
    output = output.replace('#','')  # '#' markers are scaffolding; drop them
    #output = output.replace('\n','<br/>')
    print 'putting the data in output folder'
    fout1 = codecs.open(fileout,'w','utf-8')
    fout1.write(output)
    fout1.close()
def alterations(filein, fileout):
    """Read filein (velthuis encoding), apply preprocessing changes, then
    transcode vel -> slp1 -> Devanagari and write the result to fileout.
    Appends a filename marker to log.txt (the per-word logging itself is
    currently disabled, see the commented block below).
    """
    fin = codecs.open(filein, 'r', 'utf-8')
    data = fin.read()
    fin.close()
    data = data.strip()
    print 'making preprocess changes'
    data = changelist(data)
    print "Debugging and writing to log.txt"
    log = codecs.open('log.txt', 'a', 'utf-8')
    log.write('#' + filein + "#\n")
    words = data.split(' ')
    counter = 1
    out = []
    for i in xrange(len(words)):
        word = words[i]
        word = snchanges(word)
        # Creating log for श ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/1
        """
        if re.search(r'\s["][sn]',word):
            changed = snchanges(word)
            #log.write(str(counter)+":"+word+"\n")
            counter = counter+1
            if not changed == word:
                out.append(changed)
        else:
            out.append(word)
        # Creating log for ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/2
        if re.search(r'"n[^aAiIuUfFxXeEoOykglnm]',word):
            out.append(word)
            rep = word.replace('\n',' ')
            log.write(str(counter)+":"+rep+"\n")
            counter = counter+1
        else:
            out.append(word)
        """
        out.append(word)
    data = ' '.join(out)
    log.close()
    print 'changing to slp1'
    output = transcoder.transcoder_processString(data, 'vel', 'slp1')
    #fout1 = codecs.open(fileout,'w','utf-8')
    #fout1.write(output)
    #fout1.close()
    output = slpchanges(output)
    print 'changing to Devanagari'
    output = transcoder.transcoder_processString(output, 'slp1', 'deva')
    output = output.replace('#', '')  # '#' markers are scaffolding; drop them
    #output = output.replace('\n','<br/>')
    print 'putting the data in output folder'
    fout1 = codecs.open(fileout, 'w', 'utf-8')
    fout1.write(output)
    fout1.close()
def unused_convertrecs(recs,tranin,tranout):
    """Modifies recs in place: sets abbrvunicode/titleunicode by transcoding
    each record's abbrv and title from tranin to tranout.  Best-effort: a
    record that fails is reported and skipped, not fatal."""
    n=0
    for rec in recs:
        n=n+1
        try:
            rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
            rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
            # letter followed by digit suggests an untranscoded sequence
            m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
            if m:
                print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
        except:
            # deliberate best-effort: report the bad record and continue
            print "convertrecs problem",n,rec.line.encode('utf-8')
def linking(fin,fout): infile = codecs.open(fin,'r','utf-8') input = infile.readlines() input = triming(input) outfile = codecs.open(fout,'w','utf-8') #acc:akzoByatantre,41695:akzoByatantre:n:oBy -> acc:अक्षोभ्यतन्त्रे,41695:अक्षोभ्यतन्त्रे:n:oBy for line in input: [dict,headword,replica,errcode,note] = line.split(':') [hw,lnum] = headword.split(',') hw = transcoder.transcoder_processString(hw,'slp1','deva') note = transcoder.transcoder_processString(note,'slp1','deva') outfile.write(dict+':'+hw+','+lnum+':'+hw+':'+errcode+':'+note+'\n') outfile.close() print "Check", fout, "for testing"
def transcode(self, tranin, tranout):
    """Return a one-line summary of this record with fullroot and sense
    transcoded from tranin to tranout."""
    root_out = transcoder_processString(self.fullroot, tranin, tranout)
    sense_out = self.sense
    if tranout == 'deva':
        # double quotes carry no meaning in Devanagari output
        sense_out = sense_out.replace('"', '')
    sense_out = transcoder_processString(sense_out, tranin, tranout)
    # othrroots output was dropped from the summary at some point;
    # the old form used: othrrootstr = transcoder_processString(self.othrrootstr,...)
    return 'fullroot=%s, sense=%s, sid=%s, mwcode=%s' % (
        root_out, sense_out, self.sid, self.code)
def convertrecs(recs,tranin,tranout):
    """Modifies recs in place: sets abbrvunicode/titleunicode by transcoding
    each record's abbrv and title from tranin to tranout.  Best-effort: a
    record that fails is reported and skipped, not fatal."""
    n=0
    for rec in recs:
        n=n+1
        try:
            rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
            rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
            # letter followed by digit suggests an untranscoded sequence
            m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
            if m:
                print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
            # Undo some transcodings
            rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode) # JOLLY is an author
        except:
            # deliberate best-effort: report the bad record and continue
            print "convertrecs problem",n,rec.line.encode('utf-8')
def unused_adjust_hk(m):
    """Transcode the HK text in match group 1 to SLP1, leaving xml tags
    and xml entities untouched; return the result wrapped in <s>...</s>."""
    text = m.group(1)
    result = []
    for chunk in re.split(r'(<[^>]+>)', text):  # isolate xml tags
        if chunk == '':
            continue
        if chunk[0] == '<':
            result.append(chunk)
            continue
        for piece in re.split(r'(&.*;)', chunk):  # isolate xml entities
            if piece == '':
                continue
            if piece[0] == '&':
                result.append(piece)
            else:
                # text has non-standard | for danda
                cleaned = re.sub(r'\|', '.', piece)
                if cleaned == 'oMM':
                    converted = 'o~'  # OM
                else:
                    converted = transcoder.transcoder_processString(cleaned, 'hk', 'slp1')
                result.append(converted)
    return "<s>%s</s>" % ''.join(result)
def abbrv_transcode(p):
    """Transcode abbreviation p from 'as' to 'roman1', then patch a known
    transcoding error ('Yourn' -> 'Journ')."""
    proman = transcoder.transcoder_processString(p, 'as', 'roman1')
    # correct some errors:
    return proman.replace('Yourn', 'Journ')
def generator(analysedword, translit="slp1"):
    """Given an analysed word string 'root-tag1-tag2...' (multiple analyses
    separated by '|'), look up matching word forms and return them joined
    by '|', in SLP1 or Devanagari per translit.
    NOTE(review): returns inside the loop, so only the first analysis is
    ever processed — preserved as-is.
    """
    analysedword = unicode(analysedword)  # unicode
    # fix: re.split('|', ...) splits on an empty alternation (a no-op in
    # Python 2), not the literal pipe; use str.split as elsewhere in this file.
    data = analysedword.split('|')
    for datum in data:
        separate = datum.split('-')  # split the whole string by '-'
        rootword = separate[0]  # Base word
        taglist = separate[1:]  # attributes
        # fix: guard against short taglists before peeking at taglist[-2]
        if len(taglist) >= 2 and taglist[-1] in [
                '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'
        ] and taglist[-2] in ['verbgana', 'aoristgana', 'injunctivegana']:
            taglist = taglist[:-2]  # Removed artificially added attributes
        datahavingroot = findrootword(rootword)  # possible items for this root
        outlist = []
        for rootdatum in datahavingroot:
            # If the supplied tags are a proper subset of the XML data,
            if set(taglist) < set(rootdatum):
                outlist.append(rootdatum[-1])  # Add the word form to outlist
        if translit == "deva":
            return transcoder.transcoder_processString("|".join(outlist), 'slp1', 'deva')  # Devanagari
        else:
            return "|".join(outlist)  # SLP1
def disp_md(dictcode, icase, L, hw0, url, page0, datalines):
    """ return array of lines, formatted for details of GitHub Markdown """
    lines = []
    pageref = "[page %s](%s)" % (page0, url)
    lines.append(' Case %04d: %s %s ' % (icase, hw0, pageref))
    datalines = adjust_datalines(dictcode, datalines)
    shown = datalines[0:10]  # output up to 10 lines of datalines
    lines.append('```')
    # construct potential headword change record
    lines.append("%s:%s,%s:%s:n:" % (dictcode, hw0, L, hw0))
    lines.append('')
    for raw in shown:
        # Remove '|', which is a line-separator in CAE
        cleaned = re.sub(r'[|]', '', raw)
        roman = re.sub(r'', '', transcoder.transcoder_processString(cleaned, 'as', 'roman'))
        if roman.strip() != '':
            lines.append('%s' % roman)
    if len(datalines) > 10:
        lines.append(' [and %s more lines]' % (len(datalines) - 10))
    lines.append('```')
    lines.append('------------------------------------------')
    lines.append('')
    return lines
def transcode(x, tranout='slp1'):
    """transcode from slp1 to tranout, unless line starts with ';'"""
    if x.startswith(';'):
        # ';' marks a comment line: pass through untouched
        return x
    return transcoder.transcoder_processString(x, 'slp1', tranout)
def jnutrimline(a, b):
    """Build 'root:gana:a' where root is the Devanagari->SLP1 transcoding of
    the part of b before '#' and gana is taken from b's second '#'-field."""
    fields = b.split('#')
    gananame = fields[1].split(',')[2]
    # Convert from gana name to gana number.
    gananumber = gananametonumber(gananame)
    root = transcoder.transcoder_processString(fields[0], 'deva', 'slp1')
    return root + ':' + gananumber + ':' + a
def getbasengrams(forThisBook, nth):
    """Collect the set of nth-grams from every book EXCEPT forThisBook,
    reading each pada file, transcoding its text section deva -> slp1."""
    booklist = [
        'balamanorama', 'kashika', 'laghu', 'nyasa', 'samhita', 'tattvabodhini'
    ]
    padalist = [
        'pada-1.1', 'pada-1.2', 'pada-1.3', 'pada-1.4', 'pada-2.1', 'pada-2.2',
        'pada-2.3', 'pada-2.4', 'pada-3.1', 'pada-3.2', 'pada-3.3', 'pada-3.4',
        'pada-4.1', 'pada-4.2', 'pada-4.3', 'pada-4.4', 'pada-5.1', 'pada-5.2',
        'pada-5.3', 'pada-5.4', 'pada-6.1', 'pada-6.2', 'pada-6.3', 'pada-6.4',
        'pada-7.1', 'pada-7.2', 'pada-7.3', 'pada-7.4', 'pada-8.1', 'pada-8.2',
        'pada-8.3', 'pada-8.4'
    ]
    result = set()
    for book in booklist:
        print book
        if book == forThisBook:
            pass  # skip the book under test
        else:
            for pada in padalist:
                inputdir = '../../' + book + '/' + pada
                inputfiles = glob.glob(inputdir + '/*.*')
                print inputdir
                for inputfile in inputfiles:
                    fin = codecs.open(inputfile, 'r', 'utf-8')
                    data = fin.read()
                    # text body lies after the second '---' separator
                    text = data.split('---')[2].strip()
                    text = transcoder.transcoder_processString(
                        text, 'deva', 'slp1')
                    text = re.sub('[^a-zA-Z \']+', '', text)
                    result = result.union(getngrams(text.encode('utf-8'), nth))
                    fin.close()
    print len(result), nth, 'gram'
    #print result
    return result
def adv(text):
    """For input 'word.adv', verify the word as an adverb against the INRIA
    lemmatizer and return it in Devanagari; returns None implicitly when the
    suffix is not 'adv' or the lookup fails.  Requires network access."""
    input = text.split('.')
    errormessage = 'not found as a'
    if input[1] == 'adv':
        url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + input[0] + '&t=SL&c=Advb'
        response = urllib2.urlopen(url).read()
        if errormessage not in response:
            return transcoder.transcoder_processString(input[0],'slp1','deva')
def simpleslp1(word):
    """ Apply slp1_simpleslp1 transcoder.
    lower case all letters in word, EXCEPT
    Y (palatal nasal) and R (cerebral nasal) --
    Y and R are changed to 'n' in transcoder.
    Also, replace a doubled letter by the single letter.
    Returns a list of simplified spellings (the first is canonical).
    """
    word1 = simple_lower(word)
    word2 = remove_double(word1)
    ans1 = transcoder.transcoder_processString(word2, 'slp1', 'simpleslp1lo')
    ans = [ans1]
    if 'f' in word2:
        # Handle other forms of 'f': ri,ru,ar
        for altf in ['ri', 'ru', 'ar']:
            word3 = re.sub('f', altf, word2)
            ansf = transcoder.transcoder_processString(word3, 'slp1', 'simpleslp1lo')
            ans.append(ansf)
    # allow either 'm' or 'n' before consonant
    a1 = mn_consonants(ans, 'm', 'n')  # change mC to nC (C = consonant)
    a2 = mn_consonants(ans, 'n', 'm')
    ans = ans + a1 + a2
    if 'kxp' in word2:
        # Handle other forms of 'x': l and also lr, lri,
        for altf in ['klrp', 'klrip', 'klrup', 'kalp']:
            word3 = re.sub('kxp', altf, word2)
            ansx = transcoder.transcoder_processString(word3, 'slp1', 'simpleslp1lo')
            ans.append(ansx)
    if re.search(r'ar$', ans1):
        # cases like pw: kar <-> kf.
        # This is aimed at verbs only, but the code will catch words
        # ending in punar
        for altf in ['ri', 'ru', 'r']:
            x = re.sub(r'ar$', altf, ans1)
            if x not in ans:
                ans.append(x)
    # special case of 'kalp' verb (in pw, etc) == kxp
    if ans1 == 'kalp':
        for alt in ['klp', 'klrp', 'klrip']:
            x = re.sub('kalp$', alt, ans1)
            if x not in ans:
                ans.append(x)
    # Choose to add grammar variants in the query
    # fix: removed a stray unmatched triple-quote that followed 'return ans'
    return ans
def iter(wordxml, strength="Full"):
    """Parse one or more '|'-separated 'f' XML analyses and return
    'stem-tag1-tag2...' strings joined by '|'.  strength == 'deva' converts
    output to Devanagari; otherwise SLP1 is returned."""
    if wordxml == "????":
        return "????"  # Error message
    else:
        wordxml = unicode(wordxml)  # Converted the word to unicode
        wordwithtags = []  # Empty list
        individualentries = wordxml.split("|")
        for individualentry in individualentries:
            tree = StringIO(individualentry)  # Created XML from the worddata
            # print "parsing of iter started at", printtimestamp()
            context = etree.parse(tree)  # Parsed the element tree.
            # print "parsing of iter ended at", printtimestamp()
            root = context.getroot()  # got the root of element tree e.g. 'f'
            # In Gerard's XML files, all possible attributes are children of
            # 'f'.  The last child is always 's', which stores the stem; all
            # other children are word attributes such as 'na' or 'v'.
            children = root.getchildren()[:-1]  # attributes
            basedata = root.getchildren()[-1]  # 's' stem
            basewordslp = basedata.get("stem").strip()  # Base word in SLP1 encoding.
            if strength == "deva":
                baseword = transcoder.transcoder_processString(
                    basewordslp, "slp1", "deva"
                )  # User asked for Devanagari output rather than SLP1.
            else:
                baseword = basewordslp  # Otherwise in SLP1.
            attributes = []  # An empty list to store attributes.
            for child in children:
                taglist = child.xpath(
                    ".//*"
                )  # All elements (abbreviations) of one word characteristic.
                output = [child.tag]  # First member is the tag 'v', 'na' etc.
                output = output + [
                    tagitem.tag for tagitem in taglist
                ]  # Other tags (abbreviations) appended to output list.
                # The following section is commented out right now, but would
                # be needed to know the gaNa of a verb or the 7 kinds of
                # aorist derivation.
                """if len(child.xpath('.//prs[@gn]')) > 0:
                    prsgana = child.xpath('.//prs')[0].get('gn')
                    output.append('verbgana')
                    output.append(prsgana)
                elif len(child.xpath('.//aor[@gn]')) > 0:
                    aorgana = child.xpath('.//aor')[0].get('gn')
                    output.append('aoristgana')
                    output.append(aorgana)
                elif len(child.xpath('.//inj[@gn]')) > 0:
                    injgana = child.xpath('.//inj')[0].get('gn')
                    output.append('injunctivegana')
                    output.append(injgana)"""
                attributes.append(output)  # output list appended to attributes.
            if strength == "deva":
                outputlist = converttodevanagari(attributes)  # Devanagari
            else:
                outputlist = attributes  # SLP1
            for member in outputlist:
                wordwithtags.append(
                    baseword + "-" + "-".join(member)
                )  # baseword followed by its attributes, '-'-separated
            # print "postprocessing of iter ended at", printtimestamp()
        return "|".join(
            wordwithtags
        )  # Multiple possible characteristics are separated by '|'
def convertfromfile(inputfile, outputfile): f = codecs.open(inputfile, "r", "utf-8") # Opened inputfile with UTF-8 encoding. data = f.readlines() # Read the lines into a list. f.close() # Closed the inputfile. g = codecs.open(outputfile, "w", "utf-8") # Opened the outputfile with UTF-8 encoding. for datum1 in data: # For each member of data, datum1 = datum1.strip() # Removed unnecessary whitespaces. datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1") # Converted from Devanagari to SLP1. dat = re.split("(\W+)", datum1) # Created a word list by exploding the sentence at word boundaries. for i in xrange(len(dat)): datum = dat[i].strip() # Clean whitespaces. if i % 2 == 0 and i != len( dat ): # Even members of datum are the words and odd members are word boundaries. Therefore, processing only even members. # print "analysis of word started", printtimestamp() x = devanagaridisplay(datum) # Analysed the even members. # print "analysis of word ended", printtimestamp() g.write( transcoder.transcoder_processString(datum, "slp1", "deva") + "(" + x + ")" ) # Wrote to the outputfile. print transcoder.transcoder_processString( datum, "slp1", "deva" ) + "(" + x + ")" # printed to the screen for the user. # print "wrote to the file", printtimestamp() else: g.write( transcoder.transcoder_processString(dat[i], "slp1", "deva") ) # For odd members, converted the word boundaries to their Devanagari counterparts. print transcoder.transcoder_processString( dat[i], "slp1", "deva" ) # For odd members, converted the word boundaries to their Devanagari counterparts. g.write("\n") # Newline character added print # Newline character printed on terminal. g.close() # Closed outputfile.
def convertline(line, tranfrom, tranto):
    """Transcode field 4 of an '@'-delimited line from tranfrom to tranto;
    for 'roman2' input the field is lowercased first."""
    fields = line.split('@')
    # 4th part is the part to convert
    if tranfrom == 'roman2':
        fields[4] = fields[4].lower()
    fields[4] = transcoder.transcoder_processString(fields[4], tranfrom, tranto)
    return '@'.join(fields)
def iter(wordxml, strength="Full"):
    """Parse one or more '|'-separated 'f' XML analyses and return
    'stem-tag1-tag2...' strings joined by '|'.  strength == 'deva' converts
    output to Devanagari; otherwise SLP1 is returned."""
    wordxml = unicode(wordxml)  # Converted the word to unicode
    wordwithtags = []  # Empty list
    individualentries = wordxml.split('|')
    for individualentry in individualentries:
        tree = StringIO(individualentry)  # Created XML from the worddata
        context = etree.parse(tree)  # Parsed the element tree.
        root = context.getroot()  # got the root of element tree e.g. 'f'
        # In Gerard's XML files, all possible attributes are children of 'f'.
        # The last child is always 's', which stores the stem; all other
        # children are word attributes such as 'na' or 'v'.
        children = root.getchildren()[:-1]  # attributes
        basedata = root.getchildren()[-1]  # 's' stem
        basewordslp = basedata.get(
            'stem').strip()  # Base word in SLP1 encoding.
        if strength == "deva":
            baseword = transcoder.transcoder_processString(
                basewordslp, 'slp1', 'deva'
            )  # User asked for Devanagari output rather than SLP1.
        else:
            baseword = basewordslp  # Otherwise in SLP1.
        attributes = []  # An empty list to store attributes.
        for child in children:
            taglist = child.xpath(
                './/*'
            )  # All elements (abbreviations) of one word characteristic.
            output = [
                child.tag
            ]  # The first member of output list is the tag 'v', 'na' etc.
            output = output + [
                tagitem.tag for tagitem in taglist
            ]  # Other tags (abbreviations) appended to output list.
            # The following section is commented out right now, but would be
            # needed to know the gaNa of a verb or the 7 kinds of aorist
            # derivation.
            """if len(child.xpath('.//prs[@gn]')) > 0:
                prsgana = child.xpath('.//prs')[0].get('gn')
                output.append('verbgana')
                output.append(prsgana)
            elif len(child.xpath('.//aor[@gn]')) > 0:
                aorgana = child.xpath('.//aor')[0].get('gn')
                output.append('aoristgana')
                output.append(aorgana)
            elif len(child.xpath('.//inj[@gn]')) > 0:
                injgana = child.xpath('.//inj')[0].get('gn')
                output.append('injunctivegana')
                output.append(injgana)"""
            attributes.append(
                output)  # output list is appended to attributes list.
        if (strength == "deva"):
            outputlist = converttodevanagari(attributes)  # Devanagari
        else:
            outputlist = attributes  # SLP1
        for member in outputlist:
            wordwithtags.append(
                baseword + "-" + "-".join(member)
            )  # baseword followed by its attributes, '-'-separated
    return "|".join(
        wordwithtags
    )  # Multiple possible verb characteristics are separated by a '|'
def convert4(datain, fileout, tranin, tranout):
    """Transcode the string datain from tranin to tranout and write it,
    newline-terminated, to fileout (UTF-8)."""
    body = datain
    body1 = transcoder.transcoder_processString(body, tranin, tranout)
    with codecs.open(fileout, "w", 'utf-8') as f:
        f.write('%s\n' % body1)
    #y = "%s %s" % (head,body1)
    #fpout.write("%s\n" % y)
    #fp.close()
    #fpout.close()
    print "fileout=", fileout
def dev(file):
    """Transcode the whole of `file` from SLP1 to Devanagari, replace the
    artifact cluster 'ळ्ह्' with '|', and write the result to skd_deva.txt."""
    f = codecs.open(file, 'r+', 'utf-8-sig')
    data = f.read()
    data = transcoder.transcoder_processString(data, 'slp1', 'deva')
    data = re.sub(u'ळ्ह्', '|', data)
    f.close()
    # fix: removed a dead codecs.open("hindidevanagariverbform.txt", "w+", ...)
    # whose handle was immediately overwritten — it leaked the handle and
    # truncated an unrelated file as a side effect.
    g = codecs.open("skd_deva.txt", "w+", "utf-8-sig")
    g.write(data)
    g.close()
def convertline(line, tranfrom, tranto):
    """ do transcoder, but don't convert [Page...] """
    pieces = line.split('[Page')
    # only the text before the first [Page marker is transcoded
    pieces[0] = transcoder.transcoder_processString(pieces[0], tranfrom, tranto)
    # a letter followed by a digit suggests something was left untranscoded
    unconverted = re.search(r'[a-zA-Z][0-9]', pieces[0]) is not None
    return (unconverted, '[Page'.join(pieces))
def as2slp1(x):
    """Strip punctuation, doubt markers and variant notations from an 'as'
    headword, lowercase it, and transcode to SLP1."""
    w = re.sub(r'[ +.;-]', '', x)     # spacing and punctuation
    w = re.sub(r',+$', '', w)         # trailing commas
    w = re.sub(r'\(\?\)', '', w)      # '(?)' doubt marker
    w = re.sub(r'\(=.*?\)', '', w)    # '(=...)' glosses
    w = re.sub(r'\(.*?\)$', '', w)    # trailing parenthetical
    w = re.sub(r'=.*$', '', w)        # represent variant
    w = re.sub(r',.*$', '', w)        # drop anything after a comma
    # BURFEY represents IAST of verbs in capital letters
    w = w.lower()
    return transcoder.transcoder_processString(w, 'as', 'slp1')
def __init__(self, line):
    """Parse one '<e>... <in>slp1</in> <out>roman</out>' line.  Sets
    self.status False (and nothing else) when the line does not match."""
    line = line.rstrip('\r\n')
    m = re.search(r'^<e>.*<in>(.*?)</in> <out>(.*?)</out>', line)
    if m is None:
        self.status = False
        #print('SLP1 skip:',line)
        return
    self.status = True
    self.slp1 = m.group(1)
    self.romanraw = m.group(2)
    self.roman = transcoder.transcoder_processString(self.slp1, "slp1", "roman")
def output(f, tranin, tranout, body):
    """Write a transcoding report for body (original, converted, reprs and
    unicodedata names of each converted char) to the open file f."""
    converted = transcoder.transcoder_processString(body, tranin, tranout)
    f.write('%4s: %s\n' % (tranin, body))
    f.write('%s %s\n' % (tranout, converted))
    reprline = ' '.join(repr(ch) for ch in converted)
    f.write('unic: %s\n' % reprline)
    nameline = ','.join(unicodedata.name(ch) for ch in converted)
    f.write(' : %s\n' % nameline)
    f.write('\n')
def convertline(line, tranfrom, tranto):
    """ do transcoder, for the 4th '@'-delimited field """
    fields = line.split('@')
    # 4th part is the part to convert
    fields[4] = transcoder.transcoder_processString(fields[4], tranfrom, tranto)
    # a letter followed by a digit suggests something was left untranscoded
    unconverted = re.search(r'[a-zA-Z][0-9]', fields[4]) is not None
    return (unconverted, '@'.join(fields))
def dev1(file):
    """Transcode `file` line by line from SLP1 to Devanagari (replacing the
    artifact cluster 'ळ्ह्' with '|') into skd_deva.txt, echoing each line."""
    f = codecs.open(file, 'r+', 'utf-8-sig')
    g = codecs.open("skd_deva.txt", "w+", "utf-8-sig")
    data = f.readlines()
    for datum in data:
        datum = transcoder.transcoder_processString(datum,'slp1','deva')
        datum = re.sub(u'ळ्ह्', '|', datum)
        g.write(datum)
        print datum
    g.close()
    f.close()
def toString(self):
    """Return a tab-separated record for this link: cologneid, raw linkkey,
    two transcodings of the linkkey, author abbreviation, author record and
    print type.  Exits the program if the author record is missing."""
    outarr = []
    try:
        outarr.append(self.authrec.cologneid)
    except:
        print "Link.toString error:",self.line.encode('utf-8')
        exit(1)
    outarr.append(self.linkkey)
    authkey1 = self.authrec.authabbrev()
    linkkey1 = transcoder.transcoder_processString(self.linkkey,'as1','roman')
    # transcode the same way it is done for ls in
    # correctionwork/cologne-issue-216
    linkkey2a = transcoder.transcoder_processString(self.linkkey,'asls','iast')
    linkkey2 = transcoder.transcoder_processString(linkkey2a,'iast','iast1')
    outarr.append(linkkey2)
    outarr.append(linkkey1)
    outarr.append(authkey1)
    outarr.append(self.authrec.toString())
    outarr.append(self.print_type())
    out = '\t'.join(outarr)
    return out
def wtd(text):
    """Probe the INRIA lemmatizer for the category of a Devanagari word and
    return the first matching type name; returns None implicitly when no
    category matches.  Requires network access."""
    #text = text.decode('utf-8')
    text = transcoder.transcoder_processString(text,'deva','slp1')
    wordtype = ['Noun', 'Voca', 'Verb', 'Pron', 'Part', 'Advb', 'Abso', 'Iic', 'Ifc', 'Iiv', 'Piic']
    errormessage = 'not found as a'
    for wordt in wordtype:
        url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + text + '&t=SL&c=' + wordt
        response = urllib2.urlopen(url).read()
        if errormessage in response:
            pass  # not this category; try the next
        else:
            return wordt
def simpleslp1(word):
    """ Apply slp1_simpleslp1 transcoder.
    lower case all letters in word, EXCEPT
    Y (palatal nasal) and R (cerebral nasal) --
    Y and R are changed to 'n' in transcoder.
    Also, replace a doubled letter by the single letter.
    Returns a list of candidate simplified spellings.
    """
    def sub1(m):
        a = m.group(1)
        return a.lower()
    regex1 = '([AIUFXEOMHKGNCJWQTDPBLVSZ])'
    word1 = re.sub(regex1, sub1, word)
    regex2 = r'(.)\1'
    def sub2(m):
        a = m.group(0)  # xx
        return a[0]  # x
    word2 = re.sub(regex2, sub2, word1)
    var = transcoder.transcoder_processString(word2, 'slp1', 'simpleslp1lo')
    ans = [var]
    # sometimes an 'ar' slp1 might also be slp1 vowel 'f';
    # probably when NOT followed by a vowel
    # (i.e. at end or followed by consonant)
    regex3 = r'(ar)([^aiufeo]|$)'
    def sub3(m):
        return 'r' + m.group(2)
    word3 = re.sub(regex3, sub3, var)
    if word3 != var:
        ans.append(word3)
    # sometimes, ri should be interpreted as 'f'
    # when (a) at beginning or not preceded by a vowel or followed by vowel
    regex4 = r'(^|[^aiufeo])ri([^aiufeo]|$)'
    def sub4(m):
        return m.group(1) + 'r' + m.group(2)  # drop r in ri
    word4 = re.sub(regex4, sub4, word3)
    if word4 != word3:
        ans.append(word4)
    # fix: removed an unconditional 'if True: print(dbg...)' left over from
    # debugging — it polluted stdout on every call.
    return ans
def write(option, fileout, mergerecs, tranout, name1, name2):
    """Write merged record pairs to fileout.

    option 1 keeps only clean pairs (both records present, no '?' keys);
    option 2 keeps only problem pairs; any other option keeps everything.
    Each case is written as a header line plus the name1 lines, then the
    name2 lines, separated by ';' lines.
    """
    tranin = 'slp1'
    n = 0
    nflag = 0
    neq = 0
    with codecs.open(fileout, "w", "utf-8") as f:
        for imerge, mergerec in enumerate(mergerecs):
            rec1, rec2 = mergerec
            outarr1 = []
            outarr2 = []
            flagok = True
            if (rec1 == None) or (rec2 == None):
                flagok = False
            if (rec1 != None) and (rec1.k == '?'):
                flagok = False
            if (rec2 != None) and (rec2.k == '?'):
                flagok = False
            if (option == 1) and (not flagok):
                # skip this problem merged record
                continue
            if (option == 2) and flagok:
                # skip this non-problem merged record
                continue
            n = n + 1
            if rec1 == None:
                out1 = '?'
                outarr1.append('%s: %s' % (name1, out1))
            else:
                out1 = rec1.k
                k = rec1.k
                for r in rec1.x:
                    rstr = r.transcode(tranin, tranout)
                    outarr1.append('%s: %s' % (name1, rstr))
                    assert k == r.mw
            if rec2 == None:
                out2 = '?'
                outarr2.append('%s: %s' % (name2, out2))
            else:
                out2 = rec2.k
                k = rec2.k
                for r in rec2.x:
                    rstr = r.transcode(tranin, tranout)
                    # fix: was outarr1 — rec2 lines were appended to the
                    # rec1 section by a copy-paste error
                    outarr2.append('%s: %s' % (name2, rstr))
                    assert k == r.mw
            outarr = []
            # NOTE(review): k is unbound if both records are None and
            # option filtering lets the pair through — pre-existing behavior.
            kstr = transcoder_processString(k, tranin, tranout)
            outarr.append('; Case %04d: mw = %s' % (n, kstr))
            outarr = outarr + outarr1 + [';'] + outarr2 + [';']
            for out in outarr:
                f.write(out + '\n')
    print(n, "records written to", fileout)
def preparation(inputfile, translit='deva'):
    """Read Devanagari whitespace-separated words from inputfile and return
    them as a list of cleaned SLP1 strings (non-alphabetic characters are
    stripped; fully non-alphabetic words are dropped)."""
    infile = codecs.open(inputfile, 'r', 'utf-8')
    rawwords = infile.read().split()
    rawwords = triming(rawwords)
    cleaned = []
    for raw in rawwords:
        slp = transcoder.transcoder_processString(raw, 'deva', 'slp1')
        if re.search('[^A-Za-z]', slp):
            slp = re.sub('[^A-Za-z]', '', slp)
            if slp != '':
                cleaned.append(slp)
        else:
            cleaned.append(slp)
    return cleaned
def getSKngrams(nth):
    """Collect the set of nth-grams from the siddhantakaumudi text sk1.txt,
    after stripping sutra-number markers and transcoding deva -> slp1."""
    result = set()
    fin = codecs.open('../../../siddhantakaumudi/sk1.txt', 'r', 'utf-8')
    for text in fin:
        # strip leading '{#उNNN#}' sutra-number marker and notation marks
        text = re.sub(u'^[{][#]उ[0-9]+[#][}]', '', text)
        text = text.replace(u'(अ)', '')
        text = text.replace(u'(स्व)', '')
        text = transcoder.transcoder_processString(text, 'deva', 'slp1')
        text = re.sub(u'[^a-zA-Z \']+', ' ', text)
        text = re.sub('[ ]+', ' ', text)
        result = result.union(getngrams(text.encode('utf-8'), nth))
    fin.close()
    print len(result), nth, 'gram'
    return result
def getSKngrams(nth):
    """Collect the set of nth-grams from the siddhantakaumudi text sk1.txt,
    after stripping sutra-number markers and transcoding deva -> slp1."""
    result = set()
    fin = codecs.open('../../../siddhantakaumudi/sk1.txt','r','utf-8')
    for text in fin:
        # strip leading '{#उNNN#}' sutra-number marker and notation marks
        text = re.sub(u'^[{][#]उ[0-9]+[#][}]','',text)
        text = text.replace(u'(अ)','')
        text = text.replace(u'(स्व)','')
        text = transcoder.transcoder_processString(text,'deva','slp1')
        text = re.sub(u'[^a-zA-Z \']+',' ',text)
        text = re.sub('[ ]+',' ',text)
        result = result.union(getngrams(text.encode('utf-8'),nth))
    fin.close()
    print len(result), nth, 'gram'
    return result
def unused_as2slp1_systematic(x):
    """Systematically normalize an 'as' spelling (nasals, visarga, vant/mant
    alternates) and transcode the result to SLP1."""
    w = re.sub(r'-', '', x)
    # nasals
    w = re.sub(r'n3([kg])', r'm3\1', w)
    w = re.sub(r'n5([cj])', r'm3\1', w)
    w = re.sub(r'm([pbm])', r'm3\1', w)
    w = re.sub(r'n([tdn])', r'm3\1', w)
    # visarga
    w = re.sub(r'ss', 'h2s', w)
    # alternate vant/vat or mant/mat
    w = re.sub(r'va\(n$', 'vat', w)
    w = re.sub(r'ma\(n$', 'mat', w)
    return transcoder.transcoder_processString(w, 'as', 'slp1')
def adjust_slp1(x):
    """Transcode a line from SLP1 to WX, leaving xml tags and [Page...]
    references untouched.  A line that is exactly one xml tag has its tag
    name transcoded (headword tags)."""
    # modfied to return wx
    m = re.search(r'^<(/?)(.*?)>$',x)
    if m:
        x1 = m.group(1)
        x2 = m.group(2)
        y2 = transcoder.transcoder_processString(x2,'slp1','wx')
        ans = "<%s%s>" %(x1,y2)
        return ans
    outarr = [] # wx
    parts = re.split(r'(<[^>]+>)|(\[Page.*?\])',x) # xml tags and page refs
    for part in parts:
        if not part: # re.split with two capture groups yields None/'' fillers
            pass
        elif part.startswith('<') and part.endswith('>'):
            outarr.append(part)
        elif part.startswith('[Page') and part.endswith(']'):
            outarr.append(part)
        else:
            # assume text in slp1. Convert to wx.
            y = transcoder.transcoder_processString(part,'slp1','wx')
            outarr.append(y)
    ans = ''.join(outarr)
    return ans
def convertfromfile(inputfile,outputfile): f = codecs.open(inputfile, 'r', 'utf-8') # Opened inputfile with UTF-8 encoding. data = f.readlines() # Read the lines into a list. f.close() # Closed the inputfile. g = codecs.open(outputfile, 'w', 'utf-8') # Opened the outputfile with UTF-8 encoding. for datum1 in data: # For each member of data, datum1 = datum1.strip() # Removed unnecessary whitespaces. datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1") # Converted from Devanagari to SLP1. dat = re.split('(\W+)',datum1) # Created a word list by exploding the sentence at word boundaries. for i in xrange(len(dat)): datum = dat[i].strip() # Clean whitespaces. if i % 2 == 0 and i != len(dat)-1: # Even members of datum are the words and odd members are word boundaries. Therefore, processing only even members. #print "analysis of word started", timestamp() x = devanagaridisplay(datum) # Analysed the even members. #print "analysis of word ended", timestamp() g.write(transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")") # Wrote to the outputfile. print datum, timestamp() #print transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")" # printed to the screen for the user. #print "wrote to the file", timestamp() else: g.write(transcoder.transcoder_processString(dat[i], "slp1", "deva")) # For odd members, converted the word boundaries to their Devanagari counterparts. g.write('\n') # Newline character added print # Newline character printed on terminal. g.close() # Closed outputfile.
def unused_transcode_line(x, tranin, tranout):
    """Transcode x from tranin to tranout, passing xml tags through
    untouched; a line that is exactly a [Page...] reference is returned
    as-is."""
    if re.search(r'^\[Page.*?\]$', x):
        return x
    converted = []
    for piece in re.split(r'(<[^>]*>)', x):
        if piece.startswith('<'):
            converted.append(piece)
        else:
            converted.append(
                transcoder.transcoder_processString(piece, tranin, tranout))
    return ''.join(converted)
def toString(self):
    """Return a tab-separated record for this link: cologneid, raw linkkey,
    two transcodings of the linkkey, author abbreviation, author record and
    print type.  Exits the program if the author record is missing."""
    outarr = []
    try:
        outarr.append(self.authrec.cologneid)
    except:
        print "Link.toString error:", self.line.encode('utf-8')
        exit(1)
    outarr.append(self.linkkey)
    authkey1 = self.authrec.authabbrev()
    linkkey1 = transcoder.transcoder_processString(self.linkkey, 'as1', 'roman')
    # transcode the same way it is done for ls in
    # correctionwork/cologne-issue-216
    linkkey2a = transcoder.transcoder_processString(
        self.linkkey, 'asls', 'iast')
    linkkey2 = transcoder.transcoder_processString(linkkey2a, 'iast', 'iast1')
    outarr.append(linkkey2)
    outarr.append(linkkey1)
    outarr.append(authkey1)
    outarr.append(self.authrec.toString())
    outarr.append(self.print_type())
    out = '\t'.join(outarr)
    return out
def main(inlines,hwrecs,fileout,fileout1):
    """Scan headword records for foreign-language words and write two reports.

    For each record in hwrecs whose dictionary lines are flagged by
    foreignword(), one summary line goes to fileout and an Emacs
    org-mode TODO section (with up to 10 quoted lines) goes to fileout1.
    """
    fout=codecs.open(fileout,"w","utf-8")
    fout1=codecs.open(fileout1,"w","utf-8")
    nsystematic=0  # NOTE(review): never incremented in this function.
    nout=0  # count of foreign-word records actually written
    for hwrec in hwrecs:
        # Slice out this headword's dictionary lines (linenums are 1-based).
        datalines = inlines[hwrec.linenum1-1:hwrec.linenum2]
        # is it a foreign word? If so, get list of languages.
        fw = foreignword(datalines)
        if len(fw) == 0:
            continue
        firstline = datalines[0]
        page0 = hwrec.pagecol
        l1 = hwrec.linenum1
        l2 = hwrec.linenum2
        hw0 = hwrec.hwslp
        nout = nout + 1
        dictcode='ieg'
        # output to fileout: "ieg:<headword>:foreign <lang,lang,...>"
        out = "%s:%s:foreign %s" %(dictcode,hw0,','.join(fw))
        fout.write("%s\n" % out)
        # output to fileout1: an org-mode TODO section with a pdf page link
        outarr=[]
        baseurl='http://www.sanskrit-lexicon.uni-koeln.de/scans/awork/apidev/servepdf.php?dict=%s'% dictcode
        url = '%s&page=%s' %(baseurl,page0)
        pageref = "[[%s][page %s]]" %(url,page0)
        outarr.append('* TODO Case %04d: %s %s' % (nout, hw0,pageref))
        # output up to 10 lines of datalines, transcoded as->roman
        outlines = datalines[0:10]
        for x in outlines:
            y = transcoder.transcoder_processString(x,'as','roman')
            outarr.append('; %s' % y)
        if len(datalines)>10:
            ndiff = len(datalines) - 10
            outarr.append('; [and %s more lines]' % ndiff)
        # 1 extra blank line
        outarr.append('')
        fout1.write('\n'.join(outarr) + "\n")
        # Debug early-exit, deliberately disabled via 'and False'.
        if (nout == 25) and False:
            print "debug",nout
            break
        pass
    fout.close()
    fout1.close()
    print len(hwrecs),"headword records processed"
    print nout,"records written to ",fileout
    print nout,"sections written to ",fileout1
def adjust_slp1(x):
    """Convert the text portions of x from slp1 to hk, leaving xml tags
    and [Page...] markers untouched."""
    converted = []
    # Splitting on two capture groups yields ''/None fillers, hence the guard.
    for piece in re.split(r'(<[^>]+>)|(\[Page.*?\])',x):
        if not piece:
            continue
        if (piece.startswith('<') and piece.endswith('>')) or \
           (piece.startswith('[Page') and piece.endswith(']')):
            # markup passes through verbatim
            converted.append(piece)
        else:
            # plain text: transcode slp1 -> hk
            converted.append(transcoder.transcoder_processString(piece,'slp1','hk'))
    return ''.join(converted)
def gettestngrams(forThisBook,nth): result = set() padalist=['pada-1.1','pada-1.2','pada-1.3','pada-1.4','pada-2.1','pada-2.2','pada-2.3','pada-2.4','pada-3.1','pada-3.2','pada-3.3','pada-3.4','pada-4.1','pada-4.2','pada-4.3','pada-4.4','pada-5.1','pada-5.2','pada-5.3','pada-5.4','pada-6.1','pada-6.2','pada-6.3','pada-6.4','pada-7.1','pada-7.2','pada-7.3','pada-7.4','pada-8.1','pada-8.2','pada-8.3','pada-8.4'] for pada in padalist: inputdir = '../../'+forThisBook+'/'+pada inputfiles = glob.glob(inputdir+'/*.*') print inputdir for inputfile in inputfiles: fin = codecs.open(inputfile,'r','utf-8') data = fin.read() text = data.split('---')[2].strip() text = transcoder.transcoder_processString(text,'deva','slp1') text = re.sub('[^a-zA-Z \']+','',text) result = result.union(getngrams(text.encode('utf-8'),nth)) fin.close() print len(result), nth, 'gram' return result
def adjust_hk_slp1(m):
    """re.sub callback: transcode the middle group from hk to slp1,
    wrapping [Page...] markers and periods as '#}...{#' passthroughs."""
    head = m.group(1)
    body = m.group(2)
    tail = m.group(3)
    #partsin = re.split(r'(\[Page.*?\]|[\|.]+)',x2)
    segments = re.split(r'(\[Page.*?\]|[.])',body)  # Nov 5 - 2nd pass.
    pieces = [head]
    for seg in segments:
        #if re.search(r'^(\[Page.*?\]|[\|.]+)$',part):
        if re.search(r'^(\[Page.*?\]|[.])$',seg):
            # separators are protected from transcoding
            pieces.append('#}%s{#' % seg)
        else:
            pieces.append(transcoder.transcoder_processString(seg,'hk','slp1'))
    pieces.append(tail)
    return ''.join(pieces)
def add_tags1(x):
    """Build the multi-form index entry for one sUtra line.

    Groups: 1 = SK number, 2 = sUtra text, 3 = AS number.
    NOTE(review): assumes the pattern always matches; a non-matching
    line would raise AttributeError on m.group() -- confirm with caller.
    """
    m = re.search(u'{#([फि।उ]*[0-9]+)#}(.*){@([0-9-]+)@}', x)
    # sUtra (in Devanagari)
    sutra = m.group(2).strip()
    # AS number transliterated to Devanagari digits
    num = transcoder.transcoder_processString(
        m.group(3).strip(), 'slp1', 'deva')
    """
    १.१.६९|अणुदित्सवर्णस्य चाप्रत्ययः|अणुदित्सवर्णस्य चाप्रत्ययः १.१.६९|१.१.६९ अणुदित्सवर्णस्य चाप्रत्ययः
    अणुदित्सवर्णस्य चाप्रत्ययः १.१.६९ <BR>
    """
    forms = '|'.join([num, sutra, sutra + ' ' + num, num + ' ' + sutra])
    result = '\n\n' + forms + '\n' + sutra + ' ' + num + ' <BR> '
    # Change dash to period.
    result = result.replace('-', '.')
    # Remove unnecessary two line breaks before the first entry.
    result = result.replace(u'\n\n०.०.०', u'०.०.०')
    return result
def disp_org(icase,wordtype,hw0,url,page0,datalines):
    """ return array of lines, formatted for details of Emacs org mode """
    pageref = "[[%s][page %s]]" %(url,page0)
    lines = ['* Case %04d: %s %s %s ' % (icase, wordtype,hw0,pageref)]
    # quote at most the first 10 data lines, transcoded as->roman
    for raw in datalines[0:10]:
        # Remove '|', which is a line-separator in BUR
        cleaned = re.sub(r'[|]','',raw)
        lines.append('; %s' % transcoder.transcoder_processString(cleaned,'as','roman'))
    extra = len(datalines) - 10
    if extra > 0:
        lines.append('; [and %s more lines]' % extra)
    lines.append('')
    return lines
def convert(filein,fileout,tranin,tranout): fp = codecs.open(filein,"r",'utf-8') fpout = codecs.open(fileout,"w",'utf-8') n=0; for b in fp: exp = b.split("@") x = exp[0] exp[4] = exp[4].strip() x = x.rstrip('\r\n') y = x.lower() y = y[0].upper()+y[1:] if (y == ''): continue n=n+1 z = transcoder.transcoder_processString(y,tranin,tranout) fpout.write("%s@%s@%s@%s@%s@%s\n" % (z,exp[0],exp[1],exp[2],exp[3],exp[4])) fp.close() fpout.close() print n,"lines converted to IAST and stored in abbrvoutput/sortedcrefsiast.txt\n"
def convert(filein, fileout, tranin, tranout): fp = codecs.open(filein, "r", 'utf-8') fpout = codecs.open(fileout, "w", 'utf-8') n = 0 for b in fp: exp = b.split("@") x = exp[0] exp[4] = exp[4].strip() x = x.rstrip('\r\n') y = x.lower() y = y[0].upper() + y[1:] if (y == ''): continue n = n + 1 z = transcoder.transcoder_processString(y, tranin, tranout) fpout.write("%s@%s@%s@%s@%s@%s\n" % (z, exp[0], exp[1], exp[2], exp[3], exp[4])) fp.close() fpout.close() print n, "lines converted to IAST and stored in abbrvoutput/sortedcrefsiast.txt\n"
def adjust_hk_slp1(m):
    """re.sub callback: transcode group 2 from hk to slp1; ƒPage...ƒ
    markers and periods are emitted as '}...#{' passthroughs instead."""
    out = [m.group(1)]
    for seg in re.split(u'(ƒPage.*?ƒ)|([.])',m.group(2)):
        if not seg:
            # re.split with two groups yields ''/None fillers; drop them
            continue
        # page markers and periods are protected from transcoding
        if seg.startswith(u'ƒ') or re.search(r'^([.])$',seg):
            out.append('}%s#{' % seg)
        else:
            out.append(transcoder.transcoder_processString(seg,'hk','slp1'))
    out.append(m.group(3))
    return ''.join(out)
def postprocess(line): x = line.replace('"', '`') m = re.search( '<div>([^<]*) <span class="sUtramIndex">, ([^<]*)</span> </div><p>', x) if m: rep = '---\nindex: ' + transcoder.transcoder_processString( m.group(2), 'deva', 'slp1') + '\nsutra: ' + m.group(1) + '\nvritti: nyasa\n---\n\n' x = re.sub( '<div>([^<]*) <span class="sUtramIndex">, ([^<]*)</span> </div><p>', rep, x) x = re.sub( '<span class="sUtramIndex"><a href="([0-9.]+)[.]htm">([^<]*)</a></span>', '(\g<1>)', x) x = x.replace('<span class="prashna">', '') x = x.replace('<span class="vArtikA">', '') x = re.sub('[<][^>]*[>]', '', x) x = x.strip() x += '\n' return x
def convert(filein, fileout, tranin, tranout): fp = codecs.open(filein, 'r', 'utf-8') fpout = codecs.open(fileout, 'w', 'utf-8') n = 0 for x in fp: x = x.rstrip('\r\n') if (x == ''): continue n = n + 1 m = re.search(r'^([^ ]+) (.+)$', x) if not m: out = 'line %s is unknown: %s' % (n, x) exit(1) head = m.group(1) body = m.group(2) body1 = transcoder.transcoder_processString(body, tranin, tranout) y = '%s %s' % (head, body1) fpout.write('%s\n' % y) fp.close() fpout.close() print n, 'lines converted\n'
def convert(filein, fileout, tranin, tranout): fp = codecs.open(filein, "r", 'utf-8') fpout = codecs.open(fileout, "w", 'utf-8') n = 0 for x in fp: x = x.rstrip('\r\n') if (x == ''): continue n = n + 1 m = re.search(r'^([^ ]+) (.+)$', x) if not m: out = "line %s is unknown: %s" % (n, x) exit(1) head = m.group(1) body = m.group(2) #body = re.sub('/\|/',' # ',body); #body = preg_replace('/ +/',' ',body); body1 = transcoder.transcoder_processString(body, tranin, tranout) y = "%s %s" % (head, body1) fpout.write("%s\n" % y) fp.close() fpout.close() print n, "lines converted\n"