def main():
    """Write frequently repeated two-cluster line prefixes to ``<input>.syllables``.

    Reads the UTF-8 file named by ``sys.argv[1]``.  Every stripped line is
    reduced to the concatenation of its first two clusters as produced by
    ``ml.ClusterIter`` (fewer if the line is shorter).  Prefixes containing
    U+1039 or U+103F are ignored and do not interrupt a run.  Consecutive
    lines with the same prefix form a run; a prefix is written, one per
    output line, when it was repeated more than 50 times after its first
    occurrence.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    min_repeats = 50
    path = sys.argv[1]
    # ``with`` closes both files even when clustering or writing raises.
    with codecs.open(path, 'r', encoding='utf-8') as ifile:
        with codecs.open(path + '.syllables', 'w', encoding='utf-8') as ofile:
            prev_syllables = u""
            prev_count = 0
            for raw in ifile:
                # Slicing copes with lines of fewer than two clusters, so no
                # separate short-line branch is needed.
                syllables = "".join(list(ml.ClusterIter(raw.strip()))[:2])
                if u"\u1039" in syllables or u'\u103F' in syllables:
                    continue
                if syllables == prev_syllables:
                    prev_count += 1
                    continue
                # The prefix changed: emit the finished run if it was long enough.
                if prev_count > min_repeats:
                    ofile.write('%s\n' % (prev_syllables))
                prev_count = 0
                prev_syllables = syllables
            # Flush the last run; without this a qualifying prefix at the end
            # of the input was silently dropped.
            if prev_count > min_repeats:
                ofile.write('%s\n' % (prev_syllables))
def main():
    """Write each input line as space-terminated clusters to ``<input>.pcol``.

    Reads the UTF-8 file named by ``sys.argv[1]`` and, for every stripped
    line, writes each cluster yielded by ``ml.ClusterIter`` followed by a
    single space, then a newline.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    path = sys.argv[1]
    # ``with`` closes both files even when clustering or writing raises.
    with codecs.open(path, 'r', encoding='utf-8') as ifile:
        with codecs.open(path + '.pcol', 'w', encoding='utf-8') as ofile:
            for raw in ifile:
                for cluster in ml.ClusterIter(raw.strip()):
                    ofile.write("%s " % cluster)
                ofile.write('\n')
def main():
    """Write prefixes shared by runs of consecutive lines to ``<input>.prefixes``.

    Reads the UTF-8 file named by ``sys.argv[1]`` and splits every stripped
    line into clusters with ``ml.ClusterIter``.  While no prefix is being
    tracked, the current line is compared with the remembered previous line
    via ``get_longest_match``; a result of at least two clusters becomes the
    tracked prefix.  Each following line that ``startswith`` the tracked
    prefix increments a counter.  When a line stops matching, the prefix is
    written (formatted by ``mystr``) if the counter exceeds 5, and tracking
    starts over from that line.

    Lines with at most one cluster reset all state.

    NOTE(review): a prefix that is still being tracked when the input ends,
    or when a line of at most one cluster is met, is discarded without being
    written regardless of its count.  Confirm whether that is intended.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    min_followers = 5
    path = sys.argv[1]
    # ``with`` closes both files even when clustering or writing raises.
    with codecs.open(path, 'r', encoding='utf-8') as ifile:
        with codecs.open(path + '.prefixes', 'w', encoding='utf-8') as ofile:
            prev_syllables = []
            prev_count = 0
            match = None  # prefix currently being tracked, or None
            for raw in ifile:
                syllables = list(ml.ClusterIter(raw.strip()))

                if len(syllables) <= 1:
                    # Too short to share a multi-cluster prefix: start over.
                    match = None
                    prev_syllables = []
                    prev_count = 0
                    continue

                if not match:
                    candidate = get_longest_match(syllables, prev_syllables)
                    if len(candidate) < 2:
                        # Nothing useful in common; compare the next line
                        # against this one instead.
                        prev_syllables = syllables
                        match = None
                    else:
                        match = candidate
                    continue

                if startswith(syllables, match):
                    prev_count += 1
                    continue

                # The run ended: keep the prefix only if enough lines followed it.
                if prev_count > min_followers:
                    ofile.write(mystr(match) + '\n')
                prev_syllables = syllables
                prev_count = 0
                match = None
def main():
    """Print the input lines that consist of exactly one cluster.

    Reads the UTF-8 file named by ``sys.argv[1]``.  A stripped line is
    printed (UTF-8 encoded) when ``ml.ClusterIter`` yields exactly one
    cluster for it and that cluster does not contain U+1039.

    Lines that cannot be clustered are skipped.  Previously the exception
    was swallowed and the check ran on the clusters of the preceding line,
    which re-printed stale data or raised ``NameError`` on the first line.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    # ``with`` closes the file even when printing raises.
    with codecs.open(sys.argv[1], 'r', encoding='utf-8') as ifile:
        for raw in ifile:
            try:
                syllables = list(ml.ClusterIter(raw.strip()))
            except Exception:
                # Best effort: ignore lines the cluster iterator rejects, but
                # let KeyboardInterrupt / SystemExit propagate.
                continue
            if len(syllables) == 1 and u'\u1039' not in syllables[0]:
                print("".join(syllables).encode('utf8'))
def main():
    """Load, or build and cache, a filtered bigram collocation finder.

    If ``<input>.dpkl`` exists it is unpickled.  Otherwise the UTF-8 file
    named by ``sys.argv[1]`` is read; spaces and every character of ``skip``
    are removed from each stripped line, the line is split into clusters
    with ``ml.ClusterIter``, and each item of ``possible_bigram_list`` is
    joined into one "word".  A ``BigramCollocationFinder`` is built from
    those words, bigrams seen fewer than 10 times are dropped, bigrams
    containing an entry of the ``stopwords`` file (read from the current
    directory) are dropped, and the finder is pickled to ``<input>.dpkl``.

    The finder is not returned or used further inside this function.

    NOTE(review): this function was recovered from whitespace-collapsed
    source; filtering and pickling are assumed to belong to the "no cache"
    branch only.  Confirm against the original layout.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    input_path = sys.argv[1]
    cache_path = input_path + '.dpkl'

    if os.path.exists(cache_path):
        print("Found pickle.")
        # SECURITY: unpickling can execute arbitrary code.  Only run this
        # against cache files this script produced itself.
        with open(cache_path, 'rb') as fil:
            finder4 = cPickle.load(fil)
    else:
        syllables = []
        # NOTE(review): joining with "per()" also puts the characters
        # p, e, r, ( and ) into the removal set below -- presumably
        # deliberate; confirm.
        skip = "per()".join(ml.puncts + ml.digits)
        # Last cluster seen, kept only for the diagnostic below.  The
        # original read a list-comprehension variable there, which is
        # unbound if the very first cluster fails (and always on Python 3).
        c = None
        with codecs.open(input_path, 'r', encoding='utf-8') as ifile:
            for i, raw in enumerate(ifile):
                progress(i)
                line = raw.strip().replace(" ", "")
                for ch in skip:
                    line = line.replace(ch, "")
                try:
                    clusters = []
                    for c in ml.ClusterIter(line):
                        clusters.append(c)
                    for each in possible_bigram_list(clusters):
                        syllables.append("".join(each))
                except Exception as e:
                    # Best effort: report the offending line and carry on.
                    print("%s %s %s" % (str(e), line, c))

        finder4 = BigramCollocationFinder.from_words(syllables)
        # Keep only bigrams that occur at least 10 times.
        finder4.apply_freq_filter(10)

        # Remove bigrams that contain a stopword.
        with codecs.open('stopwords', 'r', 'utf8') as stopwordFile:
            stopwords = set(each.strip() for each in stopwordFile)

        def contains_stopwords(*syllables):
            """Return True if any element of the n-gram is a stopword."""
            return any(word in stopwords for word in syllables)

        finder4.apply_ngram_filter(contains_stopwords)

        with open(cache_path, 'wb') as fil:
            cPickle.dump(finder4, fil)
def main():
    """Print every input line followed by its cluster n-grams, UTF-8 encoded.

    Reads the UTF-8 file named by ``sys.argv[1]``.  For each stripped line
    it prints the line itself, then each ``nltk.bigrams`` and
    ``nltk.trigrams`` tuple of its clusters joined into one string.  When
    the line has more than three clusters it also prints each pair produced
    by ``nltk.bigrams`` over the joined items of ``_bigrams(syllables)``.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: %s inputFile" % sys.argv[0])
        sys.exit(1)

    # ``with`` closes the file even when clustering or printing raises.
    with codecs.open(sys.argv[1], 'r', encoding='utf-8') as ifile:
        for raw in ifile:
            line = raw.strip()
            print(line.encode('utf8'))

            syllables = list(ml.ClusterIter(line))
            bigrams = nltk.bigrams(syllables)
            trigrams = nltk.trigrams(syllables)
            if len(syllables) > 3:
                joined_pairs = ["".join(x) for x in _bigrams(syllables)]
                tetragrams = nltk.bigrams(joined_pairs)
            else:
                tetragrams = []

            for each in itertools.chain(bigrams, trigrams, tetragrams):
                print("".join(each).encode('utf8'))
def main():
    """Report lines containing a cluster the syllable pattern does not match.

    Usage: ``prog syllableFile inputFile``.  Every stripped line of the
    UTF-8 ``inputFile`` is split with ``ml.ClusterIter``; for the first
    cluster in which the pattern below is not found, the cluster, a tab and
    the whole line are written to ``<inputFile>.checked`` and the rest of
    that line is skipped.

    ``syllableFile`` is still required on the command line for
    compatibility, but its content was never read, so it is no longer
    opened.

    NOTE(review): the check uses ``search`` (match anywhere in the cluster),
    not a full match.  Confirm that a partial hit should count as valid.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 3:
        print("Usage: %s syllableFile inputFile" % sys.argv[0])
        sys.exit(1)

    # The pattern is one literal split over adjacent strings for
    # readability; the compiled expression is unchanged.
    p = re.compile(
        # optional leading U+1004 U+103A U+1039 sequence
        u'(\u1004\u103a\u1039)?'
        # either one of a few fixed multi-character sequences ...
        u'((\u1000\u103b\u1015\u103a|\u100b\u1039\u100b|\u100b\u1039\u100c'
        u'|\u100d\u1039\u100d|\u100d\u1039\u100e|\u100f\u1039\u100d)'
        # ... or a single base character (or U+104E U+1004 U+103A U+1038) ...
        u'|((\u104e\u1004\u103a\u1038'
        u'|\u1000|\u1001|\u1002|\u1003|\u1004|\u1005|\u1006|\u1007'
        u'|\u1008|\u1009|\u100a|\u100b|\u100c|\u100d|\u100e|\u100f'
        u'|\u1010|\u1011|\u1012|\u1013|\u1014|\u1015|\u1016|\u1017'
        u'|\u1018|\u1019|\u101a|\u101b|\u101c|\u101d|\u101e|\u101f'
        u'|\u1020|\u1021|\u1023|\u1024|\u1025|\u1026|\u1027|\u1029|\u102a'
        u'|\u103f'
        u'|\u1040|\u1041|\u1042|\u1043|\u1044|\u1045|\u1046|\u1047'
        u'|\u1048|\u1049|\u104a|\u104b|\u104c|\u104d|\u104e|\u104f)'
        # ... optionally followed by U+1039 plus a second character
        u'(\u1039\u1010\u103d'
        u'|\u1039\u1000|\u1039\u1001|\u1039\u1002|\u1039\u1003'
        u'|\u1039\u1005|\u1039\u1006|\u1039\u1007|\u1039\u1008'
        u'|\u1039\u100b|\u1039\u100c|\u1039\u100d|\u1039\u100e|\u1039\u100f'
        u'|\u1039\u1010|\u1039\u1011|\u1039\u1012|\u1039\u1013'
        u'|\u1039\u1014|\u1039\u1015|\u1039\u1016|\u1039\u1017'
        u'|\u1039\u1018|\u1039\u1019|\u1039\u101a|\u1039\u101b|\u1039\u101c'
        u'|\u1039\u101e|\u1039\u1021)?))'
        # optional trailing marks, in this fixed order
        u'(\u103a)?(\u103b)?(\u103c)?(\u103d)?(\u103e)?(\u1031)?'
        u'(\u102d|\u102e)?(\u102f|\u1030)?(\u1032|\u1036)?(\u102b|\u102c)?'
        u'(\u1037)?(\u103a)?(\u1037)?(\u1038)?',
        re.U)

    path = sys.argv[2]
    # ``with`` closes both files even when clustering or writing raises.
    with codecs.open(path, 'r', encoding='utf-8') as ifile:
        with codecs.open(path + '.checked', 'w', encoding='utf-8') as ofile:
            for raw in ifile:
                line = raw.strip()
                for cluster in ml.ClusterIter(line):
                    if not p.search(cluster):
                        ofile.write(cluster + '\t' + line + '\n')
                        # One report per line is enough.
                        break
def main():
    """Write every line's clusters, plus derived prefix/suffix lines, to ``<input>.pwar2``.

    Reads the UTF-8 file named by ``sys.argv[1]``, splits each stripped line
    into clusters with ``ml.ClusterIter`` and writes lines formatted by
    ``mystr`` to the output file.  Besides the line itself, the loop may
    write the tail of the previous line beyond a shared prefix, the shared
    prefix once a run of matching lines ends, and may seek back to overwrite
    the previously written line with that line minus its first cluster.

    Prints a usage message and exits with status 1 when the argument count
    is wrong.  The line index is printed every 1000 lines as progress.

    NOTE(review): the nesting below was reconstructed from
    whitespace-collapsed source.  In particular, whether the ``if match``
    block sits inside ``if prev_syllables`` could not be determined;
    confirm against the original layout.
    NOTE(review): ``prev_count`` is updated but never read in this function.
    NOTE(review): the files are closed only on normal completion.
    """
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)
    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[1] + '.pwar2', 'w', encoding='utf-8')
    prev_syllables = []  # clusters of the previously written line
    prev_count = 0
    match = []  # shared prefix currently being tracked ([] when none)
    tmp = []  # value of prev_syllables at the last iteration that found no usable match
    pos = 0  # output offset at which the previous line was written
    for k, line in enumerate((l.strip() for l in ifile)):
        if (k % 1000) == 0:
            print k
        syllables = list(ml.ClusterIter(line))  #ml.ClusterIter(line).next ()
        if (len(syllables) < 1):  # < 2
            # A line without clusters resets tracking; tmp and pos are kept.
            match = []
            prev_syllables = []
            prev_count = 0
            continue
        #ofile.write ("\n\ns=%s p=%s:m=%s\n" %(mystr(syllables), mystr(prev_syllables), mystr(match)))
        if prev_syllables:
            # While a prefix of two or more clusters is tracked, write the
            # part of the previous line after that prefix, provided at least
            # two clusters remain.
            if len(match) > 1 and (len(match) + 1) < len(prev_syllables):
                ofile.write("%s\n" % mystr(prev_syllables[len(match):]))
        if match:
            if startswith(syllables, match):
                prev_count += 1
            else:
                # The run of lines sharing the prefix ended: write the prefix.
                #for u in range (prev_count):
                ofile.write("%s\n" % mystr(match))
                match = []
                prev_syllables = []
                prev_count = 0
        else:
            match = get_longest_match(syllables, prev_syllables)
            #ofile.write ("==> prev_count= %d match=%s\n" %(prev_count,mystr(syllables)))
            if len(match) < 2:
                #if len(match) == 0:
                #ofile.write ("==> here= t=%s ps=%s\n" %(mystr(tmp),mystr(prev_syllables)))
                # The previous line shares fewer than two clusters with both
                # tmp and the current line: rewind to where it was written
                # and write it again without its first cluster.
                if tmp and len(get_longest_match(tmp, prev_syllables)) < 2:
                    if len(prev_syllables) > 2:
                        ofile.seek(pos)
                        ofile.write("%s\n" % mystr(prev_syllables[1:]))
                tmp = prev_syllables
                match = []
                prev_count = 0
            else:
                # A new shared prefix was found: write the rest of the
                # previous line beyond it, if at least two clusters remain.
                if (len(match) + 1) < len(prev_syllables):
                    ofile.write("%s\n" % mystr(prev_syllables[len(match):]))
                prev_count += 1
        # Remember where this line starts so a later iteration can seek back.
        pos = ofile.tell()
        ofile.write("%s\n" % mystr(syllables))
        prev_syllables = syllables
    ifile.close()
    ofile.close()