def cleanMP(raw):
    """Clean raw Madhya Pradesh roll rows into a list of record dicts."""
    record = []
    for i in range(len(raw)):
        d1 = {}
        # Drop a leading junk field unless it contains the expected marker.
        try:
            if raw[i][0].find('ताम') == -1:
                raw[i].remove(raw[i][0])
        except Exception:
            print('Removed')
        # Name: keep the Devanagari original plus a transliterated copy.
        try:
            d1['hname'] = raw[i][0].split(':')[1].strip()
            d1['name'] = transliterate(d1['hname'], DEVANAGARI, HK)
            # Normalize a common transliteration variant.
            d1['name'] = d1['name'].replace('sinha', 'singh')
        except Exception:
            d1['name'] = ''
        # Father/husband name, same treatment.
        try:
            d1['hfather/husband'] = raw[i][1].split(':')[1].strip()
            d1['father/husband'] = transliterate(d1['hfather/husband'], DEVANAGARI, HK)
        except Exception:
            d1['father/husband'] = ''
        # Age: keep only the ASCII digits; a valid age is exactly two digits.
        try:
            d1['age'] = ''.join(c for c in raw[i][3] if '0' <= c <= '9')
            if len(d1['age']) != 2:
                d1['age'] = ''
        except Exception:
            d1['age'] = ''
        # Sex: 'M' if the Devanagari word for "male" appears anywhere in the row.
        d1['sex'] = 'M' if any('पुरूष' in myraw for myraw in raw[i]) else 'F'
        record.append(d1)
    return record
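# Hypothetical usage sketch for cleanMP. The real row layout comes from the
# upstream scraper, so the field order and labels below are assumptions,
# shown only to document the expected shape:
#
#   rows = [[u'नाम : रामलाल', u'पिता : श्यामलाल', u'मकान : 12',
#            u'उम्र : 42', u'पुरूष']]
#   cleanMP(rows)
#   # -> [{'hname': ..., 'name': ..., 'hfather/husband': ...,
#   #      'father/husband': ..., 'age': '42', 'sex': 'M'}]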
def transliterate_word(text, lang='H'):
    """Find dictionary words for a Latin-script token, trying progressively fuzzier matches."""
    # Strip punctuation; if nothing is left, there is no word to look up.
    for p in ",.><?/+=-_}{[]*&^%$#@!~`\"\\|:;()":
        text = text.replace(p, '')
    if not text:
        return []
    text = text.encode('utf-8')
    dict_to_use = hindi_dictionary if lang == 'H' else marathi_dictionary
    # 1. Direct Harvard-Kyoto -> Devanagari transliteration, looked up as-is.
    immediate_transliteration = transliterate(text, HK, DEVANAGARI)
    attempt = list(dict_to_use.find({'word': immediate_transliteration}))
    if attempt:
        return [attempt[0]['word']]
    # 2. Match against stored transliterations (original or lowercased).
    attempt = list(dict_to_use.find({'transliterated': {'$in': [text, text.lower()]}}))
    if attempt:
        return [attempt[0]['word']]
    # 3. Ask Google's transliterator for candidates and look each one up.
    for tr in google_transliterate(text, lang):
        ll = list(dict_to_use.find({'word': tr}))
        if ll:
            return [li['word'] for li in ll]
    # 4. Last resort: the Soundex bucket, ranked by edit distance, threshold < 3.
    sound = soundex(text)
    attempt = list(dict_to_use.find({'sound': sound}))
    neighbors = sorted([(i['word'], distance(text, i['transliteration'])) for i in attempt],
                       key=lambda x: x[1])
    return [u[0] for u in neighbors if u[1] < 3]
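# Hedged usage sketch for transliterate_word (the word is illustrative);
# assumes live hindi_dictionary / marathi_dictionary Mongo collections,
# populated by the loader script below:
#
#   candidates = transliterate_word('kitab', lang='H')
#   # -> direct dictionary matches if any, otherwise Google candidates,
#   #    otherwise Soundex neighbours within edit distance 3.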
def train(train_ratio, test_ratio):
    MFILE = 'sanscript4.crfsuite'
    count = 0
    train_sents = []
    test_sents = []
    # Each line of the pairs file is "english<TAB>hindi".
    with open(RESOURCE_FOLDER + '/Output/all-words-pairs.txt') as infile:
        for line in infile:
            count += 1
            line = line.strip()
            e, h = line.split('\t')
            h = h.decode('utf-8')
            # Train on the Harvard-Kyoto transliteration of the Hindi side.
            ie = transliterate(h, DEVANAGARI, HK).encode('utf-8')
            # Sample train/test subsets at the given rates.
            if random() < train_ratio:
                train_sents.append((e, ie))
            if random() < test_ratio:
                test_sents.append((e, h))
    X_train = [s[0] for s in train_sents]
    y_train = [s[1] for s in train_sents]
    X_test = [s[0] for s in test_sents]
    y_test = [s[1] for s in test_sents]
    print 'Loading trainer...'
    print len(X_train), len(y_train)
    print len(X_test), len(y_test)
    trainer = pycrfsuite.Trainer(verbose=False)
    count = 0
    skipped = 0
    for xseq, yseq in zip(X_train, y_train):
        if count % 100 == 0:
            print count
        try:
            xss = [xs for xs in ngrams(xseq)]
            yss = [ys for ys in ngrams(yseq)]
            # Pair every source n-gram with every target n-gram as a
            # one-item training sequence.
            for xs in xss:
                for ys in yss:
                    count += 1
                    trainer.append([xs], [ys])
        except Exception, e:
            print str(e)
            skipped += 1
    print 'Skipped:', skipped
    # Fit the model and write it to disk so the tagger below can load it.
    trainer.train(MFILE)
    print '[Done]'
    # Load the trained model and transliterate the held-out test words.
    tagger = pycrfsuite.Tagger()
    tagger.open(MFILE)
    print '[Begin Testing]...'
    count = 0
    print 'Writing to output.txt'
    with codecs.open('output.txt', 'w', encoding='utf-8') as outfile:
        for xseq, yseq in zip(X_test, y_test):
            count += 1
            if count % 100 == 0:
                print count
            # Tag once per character cluster and once on the whole sequence,
            # so both decodings can be compared in the output file.
            xss = [xs for xs in esplitclusters(xseq)]
            ypredicted = tagger.tag(xss)
            ypredicted2 = tagger.tag([xss])
            # Convert the predicted Harvard-Kyoto string back to Devanagari.
            _ie = transliterate(''.join(ypredicted), HK, DEVANAGARI)
            v1 = "%s\t" % xseq
            v2 = "%s\t" % yseq
            v3 = "%s\t" % [u.decode('utf-8') for u in ypredicted]
            v4 = "%s\t" % ''.join(ypredicted).decode('utf-8')
            v5 = "%s\t" % [u.decode('utf-8') for u in ypredicted2]
            v6 = "%s\t" % ','.join(ypredicted2).decode('utf-8')
            v7 = "%s\t" % _ie
            outfile.write(v1 + v2 + v3 + v4 + v5 + v6 + v7 + '\n')


if __name__ == '__main__':
    train(0.04, 0.2)
    # print ngrams('arshad')
    # print ngrams('something')
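# ngrams() is used above but defined elsewhere in the project. This is only
# a sketch of its apparent contract, inferred from how start() in the
# probabilities script unpacks each item as
# (unit, preceding_context, following_context); the window size is an
# assumption.
def ngrams(word, window=3):
    """Yield (char, prev_context, next_context) for each position in word."""
    for i in xrange(len(word)):
        yield (word[i], word[max(0, i - window):i], word[i + 1:i + 1 + window])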
import pickle

import fuzzy
from pymongo import MongoClient

# transliterate, DEVANAGARI, HK and strip_non_ascii come from the
# project's transliteration helpers.

conn = MongoClient()
db = conn.sentiment_analysis_db

path = '../files/hindi/'
word2Synset = pickle.load(open(path + "WordSynsetDict.pk"))
# dmetaphone = fuzzy.DMetaphone()
soundex = fuzzy.Soundex(4)

# Rebuild the dictionary collection from scratch.
print db.hindi_dictionary.drop_indexes()
print db.hindi_dictionary.remove({})

words = []
for word in word2Synset.keys():
    transliterated = strip_non_ascii(transliterate(word, DEVANAGARI, HK))
    # Flatten the per-POS synset lists into a single list.
    synsets = []
    for vv in word2Synset[word].values():
        synsets.extend(vv)
    lower = transliterated.lower()
    sound = soundex(lower.decode('ascii', errors='ignore'))
    words.append({
        'word': word,
        'synsets': synsets,
        # Stored under both keys: the lookup scripts query 'transliterated'
        # while the Soundex ranking reads 'transliteration'.
        'transliteration': lower,
        'transliterated': lower,
        'sound': sound
    })
    # Insert in batches of ~1000 to keep memory bounded.
    if len(words) > 1000:
        db.hindi_dictionary.insert_many(words)
        words = []

# Flush the final partial batch.
if words:
    db.hindi_dictionary.insert_many(words)
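# strip_non_ascii() is imported from elsewhere in the project; a minimal
# stand-in matching how it is used here (drop every non-ASCII character
# left over after transliteration) would be:
def strip_non_ascii(s):
    return ''.join(c for c in s if ord(c) < 128)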
def start():
    # pointers, x_previous_tags, x_next_tags, y_previous_tags and
    # y_next_tags are module-level defaultdict counters; factor() is a
    # weighting function defined elsewhere in this module.
    total = 0
    sum_total = defaultdict(int)
    with open(RESOURCE_FOLDER + '/Output/all-words-pairs.txt') as infile:
        for line in infile:
            total += 1
            # Only sample the first 2000 pairs.
            if total > 2000:
                break
            line = line.strip()
            e, h = line.split('\t')
            e = e.decode('utf-8')
            h = h.decode('utf-8')
            op = transliterate(h, DEVANAGARI, HK)
            xss = [u for u in ngrams(e)]
            yss = [u for u in ngrams(op)]
            # Count co-occurrences of source/target units together with
            # their preceding and following contexts.
            for _u in xss:
                for _v in yss:
                    u, u_prev, u_next = _u
                    v, v_prev, v_next = _v
                    pointers[u][v] += 1
                    for i in xrange(len(u_prev)):
                        x_previous_tags[u][u_prev[i:]] += 1
                    for j in xrange(1, len(u_next)):
                        x_next_tags[u][u_next[:j]] += 1
                    for i in xrange(len(v_prev)):
                        y_previous_tags[v][v_prev[i:]] += 1
                    for j in xrange(1, len(v_next)):
                        y_next_tags[v][v_next[:j]] += 1
                    sum_total[u] += len(yss)
            if total % 1000 == 0:
                print total
    print len(pointers), '...'
    print 'Normalizing and saving probabilities'
    with codecs.open('probabilities.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in pointers.iteritems():
            tot = sum_total[k]
            values = sorted([(u, (v * factor(k, u)) / tot) for (u, v) in vdict.iteritems()],
                            reverse=True, key=lambda x: x[1])
            # Keep the top 30% of candidates, capped at 10.
            values = values[:min(int(len(vdict) * .3), 10)]
            for u, v in values:
                if v < 0.001:
                    continue
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()

    def dump(filename, table):
        # The four context tables share one layout: key, context, count.
        with codecs.open(filename, 'w', encoding='utf8') as outfile:
            for k, vdict in table.iteritems():
                for u, v in vdict.iteritems():
                    outfile.write("%s\t%s\t%f\n" % (k, u, v))
                outfile.flush()

    print 'Writing x previous'
    dump('x_previous.txt', x_previous_tags)
    print 'Writing x next'
    dump('x_next.txt', x_next_tags)
    print 'Writing y previous'
    dump('y_previous.txt', y_previous_tags)
    print 'Writing y next'
    dump('y_next.txt', y_next_tags)
    print 'Total:', total
from Levenshtein import distance
from pymongo import MongoClient
import fuzzy

# transliterate, DEVANAGARI, HK and strip_non_ascii come from the
# project's transliteration helpers.

conn = MongoClient()
db = conn.sentiment_analysis_db
soundex = fuzzy.Soundex(4)

# For every word in the frequency list, find its dictionary entry:
# exact match first, then by transliteration, then by Soundex plus
# edit distance.
for line in open('../../resources/word-frequency-hindi.txt'):
    line = line.strip()
    word, freq = line.split('\t')
    word = word.decode('utf-8')
    found = db.hindi_dictionary.find_one({'word': word})
    if not found:
        transliterated = transliterate(word, DEVANAGARI, HK)
        transliterated = strip_non_ascii(transliterated)
        found = db.hindi_dictionary.find_one(
            {'transliterated': transliterated})
        if not found:
            sound = soundex(transliterated)
            sounding_same = list(db.hindi_dictionary.find({'sound': sound}))
            if len(sounding_same) > 0:
                # Closest match among words with the same Soundex code.
                found = sorted([(i['word'], distance(word, i['word']))
                                for i in sounding_same],
                               key=lambda x: x[1])[0][0]
        else:
            found = found['word']
    else:
        found = found['word']
    print word, found