def get_tokenized_sentences(text, language):
    lang = ""
    if language == "english":
        lang = "en"
    elif language == "hindi":
        lang = "hi"
    elif language == "telugu":
        lang = "te"
    tk = Tokenizer(lang=lang, split_sen=True)
    tokens = tk.tokenize(text)
    return get_sentences(tokens)
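# Usage sketch (illustrative, not part of the source): assumes Tokenizer and
# get_sentences are importable in this module; the Hindi sample text is made up.
sample = "राम घर गया। वह कल वापस आएगा।"
print(get_tokenized_sentences(sample, "hindi"))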
from tkinter import Tk, ttk, filedialog
from tkinter import constants as const
from tkinter.scrolledtext import ScrolledText

import docxpy
from polyglot_tokenizer import Tokenizer


class Root(Tk):

    def __init__(self):
        super(Root, self).__init__()
        self.title("Tkinter Browse")
        self.minsize(1000, 600)

        self.labelFrame = ttk.Label(self, text="")
        self.labelFrame.grid(row=0, column=1)

        self.label_frame = ttk.Frame(self, height=50)
        # Stops child widgets of label_frame from resizing it
        self.label_frame.grid(row=10)

        # ttk.Entry has no "text" option, so it is dropped here
        self.entry = ttk.Entry(self.labelFrame, width=50)
        self.entry.grid()

        self.button()

    def button(self):
        # Store the widget under a different name so it does not shadow this method
        self.browse_button = ttk.Button(self.labelFrame, text="Browse",
                                        command=self.filedialog)
        self.browse_button.grid(column=3, row=0)
        #self.entry1 = ttk.Entry(self.label_frame, width=50)
        #self.entry1.grid(row=10)
        #self.label1 = ttk.Label(self.labelFrame, text="", width=100)
        #self.label1.grid(row=10)

    def filedialog(self):
        #root.filename = tkFileDialog.askopenfilename(initialdir="/", title="Select file", filetypes=(("jpeg files", "*.jpg"), ("all files", "*.*")))
        self.filename = filedialog.askopenfilename(initialdir="/", title="Select file")
        self.entry.insert(0, self.filename)
        self.text = docxpy.process(self.filename)
        print(self.text)
        self.txt = ScrolledText(self.labelFrame, width=100, height=20)
        self.txt.grid(row=7)
        self.txt.insert(const.END, self.text)
        #self.label1.config(text=self.text)
        #self.entry1.insert(0, self.text)
        self.token()

    def entry(self):
        self.entry = ttk.Entry(self.labelFrame, width=50)
        self.entry.grid(column=1, row=6, padx=5, pady=60, ipady=3)
        return

    def token(self):
        self.tkn = Tokenizer(lang='ml', smt=True)  # smt is a flag for social-media text
        #self.text = docxpy.process(self.filename)
        print(self.tkn.tokenize(self.text))
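# Minimal usage sketch (assumes the imports above): opening a .docx through the
# Browse dialog fills the scrolled text area and prints its tokenized content.
if __name__ == "__main__":
    root = Root()
    root.mainloop()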
def tokenize_data(data_path, lang, forcesave=False):
    """Load and save tokenized data."""
    tokenized_data_path = data_path + ".ptok"
    if not forcesave:
        if path.exists(tokenized_data_path):
            logger.info("Tokenized file already present at {}".format(tokenized_data_path))
            n_sents = 0
            with codecs.open(tokenized_data_path, "rb") as fp:
                n_sents = len(pickle.load(fp))
            return tokenized_data_path, n_sents

    data_tuple = []
    n_sents = 0
    with codecs.open(data_path, 'r', encoding='utf-8') as fp:
        logger.info("Loading whole data in memory ...")
        textlines = fp.readlines()
        tok = Tokenizer(lang=lang, split_sen=True)
        tokenized_sents = tok.tokenize_lines(textlines)
        for tokens in tokenized_sents:
            sent = []
            for token in tokens:
                # Necessary to use as tuple for caching
                # while generating features based on
                # previous, current and next word
                sent.append((token, "", ""))
            data_tuple.append(sent)
            n_sents += 1
    logger.info("Tokenization done")

    with codecs.open(tokenized_data_path, "wb") as wt:
        logger.info("Writing data into pickle format")
        pickle.dump(data_tuple, wt, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info("Data written")
    return tokenized_data_path, n_sents
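# Usage sketch (illustrative; the file path is hypothetical): tokenize a Hindi
# corpus file once, then rely on the cached "<file>.ptok" pickle on later runs.
# Assumes the module-level imports (codecs, pickle, os.path as path, Tokenizer)
# and logger that tokenize_data relies on are in place.
tok_path, n_sents = tokenize_data("data/train/hi/train.utf.txt", lang="hi")
print(tok_path, n_sents)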
def load_data(text_type, filename, lang, tokenize_text=False, split_sent=True):
    data_tuple = []
    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s format' % (text_type))
        if text_type == "ssf":
            start_c = -1
            for line in fp:
                line = line.strip()
                ds = line.split()
                #print("Line", line)
                #print("DS", ds)
                if line == "":
                    continue
                elif line[0:2] == "<S":
                    sent = []
                elif line[0:3] == "</S":
                    data_tuple.append(sent)
                elif line[0] == "<":
                    continue
                elif ds[0] == "0" or ds[0] == "))":
                    continue
                elif ds[1] == "((":
                    start_c, chunk_tag = 1, ds[2]
                    #print("hello-chunk tag", chunk_tag)
                elif len(ds) > 2:
                    if ds[2]:
                        #print("--", line, "--")
                        word, tag = ds[1], ds[2]
                        if start_c == -1:
                            sent.append((word, tag, ""))
                        elif start_c == 1:
                            # first word of the chunk
                            sent.append((word, tag, "B-%s" % (chunk_tag)))
                            start_c = 0
                        elif start_c == 0:
                            # subsequent words of the chunk
                            sent.append((word, tag, "I-%s" % (chunk_tag)))
        elif text_type == "conll":
            sent = []
            for line in fp:
                line = line.strip()
                ds = line.split()
                if line != "":
                    print(line)
                    if len(ds) == 2:
                        word, tag, chunk = ds[1], "", ""
                    if len(ds) == 3:
                        word, tag, chunk = ds[1], ds[2], ""
                    if len(ds) == 4:
                        word, tag, chunk = ds[1], ds[2], ds[3]
                    sent.append([word, tag, chunk])
                else:
                    data_tuple.append(sent)
                    sent = []
        elif text_type == "txt":
            if split_sent:
                text = fp.read()
                tok = Tokenizer(lang=lang, split_sen=split_sent)
                tokenized_sents = tok.tokenize(text)
                for tokens in tokenized_sents:
                    # start a fresh list for every tokenized sentence
                    sent = []
                    for token in tokens:
                        sent.append([token, "", ""])
                    data_tuple.append(sent)
            else:
                for line in fp:
                    sent = []
                    if tokenize_text:
                        tok = Tokenizer(lang=lang, split_sen=False)
                        tokenized_sents = tok.tokenize(line)
                        for tokens in tokenized_sents:
                            for token in tokens:
                                sent.append([token, "", ""])
                        data_tuple.append(sent)
        else:
            print("Check - text_type", text_type)
    return data_tuple
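# Usage sketch (illustrative; the file path is hypothetical): load a plain-text
# Telugu file, letting the tokenizer split sentences. Each sentence comes back
# as a list of [word, tag, chunk] triples with empty tag/chunk slots.
sents = load_data("txt", "samples/te_input.txt", "te", tokenize_text=True, split_sent=True)
for sent in sents[:2]:
    print([word for word, tag, chunk in sent])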
def pipeline():
    curr_dir = path.dirname(path.abspath(__file__))
    args = get_args()
    output_dir = path.join(path.dirname(path.abspath(__file__)), "outputs")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    data_writer.set_logger(args.model_type, output_dir)

    model_path = "%s/models/%s/%s.%s.%s.model" % (
        curr_dir, args.language, args.model_type, args.tag_type, args.encoding)
    if args.model_type == "lstm":
        if args.tag_type == "pos":
            model_path = "%s/models/%s/lstm/" % (curr_dir, args.language)
        elif args.tag_type == "chunk":
            model_path = "%s/models/%s/lstm/chunk/" % (curr_dir, args.language)
        elif args.tag_type == "ner":
            model_path = "%s/models/%s/lstm/ner/" % (curr_dir, args.language)
        if args.tag_type != "parse":
            if not os.path.exists(model_path):
                os.makedirs(model_path)

    if args.pipeline_type == 'train':
        logger.info('Start Training#')
        logger.info('Tagger model type: %s' % (args.model_type))
        data_path = "%s/data/train/%s/train.%s.%s" % (
            curr_dir, args.language, args.encoding, args.data_format)
        if args.tag_type == "ner":
            data_path = data_path + ".ner"
        data_sents = data_reader.load_data(args.data_format, data_path, args.language)

        no_words = sum(len(sent) for sent in data_sents)
        logger.info("No. of words: %d" % (no_words))
        logger.info("No. of sents: %d" % (len(data_sents)))

        X_data = [
            generate_features.sent2features(s, args.tag_type, args.model_type)
            for s in data_sents
        ]
        y_data = [
            generate_features.sent2labels(s, args.tag_type) for s in data_sents
        ]
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, y_data, test_size=0.10, random_state=42)

        print('Train data size:', len(X_train), len(y_train))
        print('Test data size:', len(X_test), len(y_test))
        print('Lang:', args.language)
        print('Train data: ', data_path)
        print('Model Path: ', model_path)

        if args.model_type == "crf":
            tagger = CRF(model_path)
            tagger.train(X_train, y_train)
            tagger.load_model()
            tagger.test(X_test, y_test)
        elif args.model_type == "lstm":
            x_data, y_data1, y_data2 = load_data_and_labels(data_path)
            if args.tag_type == "pos":
                # Split the data into train and test
                x_train, x_test, y_train1, y_test1 = train_test_split(
                    x_data, y_data1, test_size=0.10, random_state=42)
                model = Sequence()  # Initialize the BiLSTM model
                model.fit(x_train, y_train1, epochs=10)  # Train the model for 10 epochs
                print(model.score(x_test, y_test1))  # Evaluate the model on the test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")
            if args.tag_type == "chunk":
                # Split the data into train and test
                x_train, x_test, y_train2, y_test2 = train_test_split(
                    x_data, y_data2, test_size=0.10, random_state=42)
                model = Sequence()  # Initialize the BiLSTM model
                model.fit(x_train, y_train2, epochs=10)  # Train the model for 10 epochs
                print(model.score(x_test, y_test2))  # Evaluate the model on the test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")
            if args.tag_type == "ner":
                # Split the data into train and test
                x_train, x_test, y_train1, y_test1 = train_test_split(
                    x_data, y_data1, test_size=0.10, random_state=42)
                model = Sequence()  # Initialize the BiLSTM model
                model.fit(x_train, y_train1, epochs=10)  # Train the model for 10 epochs
                print(model.score(x_test, y_test1))  # Evaluate the model on the test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")

    if args.pipeline_type == "test":
        if args.model_type == "crf":
            test_data_path = "%s/%s" % (curr_dir, args.test_data)
            test_sents = data_reader.load_data(args.data_format, test_data_path,
                                               args.language, tokenize_text=False)
            X_test = [
                generate_features.sent2features(s, args.tag_type, args.model_type)
                for s in test_sents
            ]
            y_test = [
                generate_features.sent2labels(s, args.tag_type) for s in test_sents
            ]
            tagger = CRF(model_path)
            tagger.load_model()
            tagger.test(X_test, y_test)

    if args.pipeline_type == "predict":
        test_data_path = "%s" % (args.test_data)
        test_sents = data_reader.load_data(args.data_format, test_data_path,
                                           args.language, tokenize_text=True,
                                           split_sent=args.sent_split)
        if args.tag_type == "parse":
            # POS tagging
            X_test = [
                generate_features.sent2features(s, "pos", args.model_type)
                for s in test_sents
            ]
            tag_model_path = "%s/models/%s/%s.%s.%s.model" % (
                curr_dir, args.language, args.model_type, "pos", args.encoding)
            chunk_model_path = "%s/models/%s/%s.%s.%s.model" % (
                curr_dir, args.language, args.model_type, "chunk", args.encoding)
            if args.model_type == "crf":
                tagger = CRF(tag_model_path)
                tagger.load_model()
                y_pos = tagger.predict(X_test)
                test_sents_pos = generate_features.append_tags(test_sents, "pos", y_pos)

                X_test = [
                    generate_features.sent2features(s, "chunk", args.model_type)
                    for s in test_sents_pos
                ]
                chunker = CRF(chunk_model_path)
                chunker.load_model()
                y_chunk = chunker.predict(X_test)

                test_fname = path.basename(test_data_path)
                output_file = "%s/%s.parse" % (output_dir, test_fname)
                data_writer.write_anno_to_file(output_file, test_sents_pos, y_chunk, "chunk")
                logger.info("Output in: %s" % output_file)
                data_writer.write_to_screen(output_file)
        else:
            X_test = [
                generate_features.sent2features(s, args.tag_type, args.model_type)
                for s in test_sents
            ]
            if args.model_type == "crf":
                tagger = CRF(model_path)
                tagger.load_model()
                y_pred = tagger.predict(X_test)
                data_writer.write_anno_to_file(args.output_path, test_sents,
                                               y_pred, args.tag_type)
                data_writer.write_to_screen(args.output_path)
                logger.info("Output in: %s" % args.output_path)
            if args.model_type == "lstm":
                model = Sequence().load(model_path + "/weights.h5",
                                        model_path + "/params.json",
                                        model_path + "/preprocessor.json")
                f = open(args.test_data, "r")
                text = f.read()
                tok = Tokenizer(lang=args.language, split_sen=True)
                tokenized_sents = tok.tokenize(text)
                # rebuild the input as a space-separated token string
                sent = ""
                for tokens in tokenized_sents:
                    for token in tokens:
                        sent = sent + " " + token
                sent = sent.strip()
                print(model.analyze(sent))
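# Illustrative sketch (not part of pipeline): tag one pre-tokenized sentence with
# an already trained CRF POS model, using the same helpers the pipeline calls.
# The model path and the Telugu sentence are hypothetical.
sent = [[w, "", ""] for w in "నేను ఇంటికి వెళ్తున్నాను".split()]
X = [generate_features.sent2features(sent, "pos", "crf")]
tagger = CRF("models/telugu/crf.pos.utf.model")
tagger.load_model()
print(tagger.predict(X))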
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from indictrans import Transliterator
from polyglot_tokenizer import Tokenizer

flag = True
s = 'hin'
t = 'eng'

forward_transl_full = Transliterator(source=s, target=t, build_lookup=True)
forward_transl_token = Transliterator(source=s, target=t, decode='beamsearch')
back_transl_token = Transliterator(source=t, target=s, build_lookup=True)

tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  # \nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"
l = l.lower().strip()
lines = l.split("\n")
print(lines)

output = []
if flag:
    for l in lines:
        json = {}
        definitive = forward_transl_full.transform(l)
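        # Sketch of how the per-token pass could continue (added illustration, not
        # the original script's body): transliterate each token with beamsearch and
        # keep a candidate as a suggestion only when transliterating it back
        # reproduces the source token, mirroring the JSON mode of process_args below.
        tokens = tk.tokenize(l)
        back_tokens = tk_back.tokenize(definitive)
        for src_tok, chosen in zip(tokens, back_tokens):
            suggestions = []
            for cand in forward_transl_token.transform(src_tok):
                if back_transl_token.transform(cand) == src_tok and cand != chosen:
                    suggestions.append(cand)
            json[src_tok] = {"chosen": chosen, "suggestions": suggestions}
        output.append(json)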
from __future__ import unicode_literals
from polyglot_tokenizer import Tokenizer

tk = Tokenizer(lang='ml', smt=True)  # smt is a flag for social-media text

text = "രണ്ട് വർഷംമുമ്പ് നടന്ന നിയമസഭാ തെരഞ്ഞെടുപ്പിൽ തിരിച്ചടി ലഭിച്ചതിനുശേഷം ഗുജറാത്തിൽ ബിജെപിയുടേത് ഒരുതരം ഞാണിൻമേൽക്കളിയാണ്. കഴിഞ്ഞ ലോക്സഭാ തെരഞ്ഞെടുപ്പിൽ ആകെയുള്ള 26 സീറ്റിലും വിജയിച്ച ബിജെപി ഇക്കുറി അത് നിലനിർത്താനായി എല്ലാ വൃത്തികെട്ട കളിയും പുറത്തെടുക്കുകയാണ്."
print(tk.tokenize(text))
def process_args(args):
    if not (args.ml or args.rb):
        args.rb = True

    if args.infile:
        ifp = io.open(args.infile, encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ifp = codecs.getreader('utf8')(sys.stdin.buffer)
        else:
            ifp = codecs.getreader('utf8')(sys.stdin)

    if args.outfile:
        ofp = io.open(args.outfile, mode='w', encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
        else:
            ofp = codecs.getwriter('utf8')(sys.stdout)

    # Limit case: work around known indic-trans issues (hard-coded)
    if args.target == "urd" or args.source == 'urd':
        args.build_lookup = False

    # Select the regex used to find word offsets inside the document:
    # Indic-script output needs a Unicode-aware pattern, romanized/English
    # output can use ASCII word boundaries.
    if args.source == 'eng' and args.target in ISO_3to2 and args.target != 'eng':
        # Unicode (UTF-8) parser regex
        def my_regex(word):
            return r"(?<!\S){}(?!\S)".format(re.escape(word))
    else:
        # ASCII romanized parser regex
        def my_regex(word):
            return r"\b{}\b".format(re.escape(word))

    if args.output_format == 'stdout':
        # initialize the transliterator object
        trn = Transliterator(args.source, args.target, rb=args.rb,
                             build_lookup=args.build_lookup)
        # transliterate the text
        for line in ifp:
            if args.source == 'hin' and args.target == 'eng':
                replacements = {u"\u0950": "om", u"\u0915\u092e\u0932": "kamal"}
                # .items() works on both Python 2 and 3
                for script, roman in replacements.items():
                    line = line.replace(script, roman)
            tline = trn.convert(line)
            if u"whatsapp" in tline and args.target == 'eng' and u"whatsapp" not in line:
                tline = tline.replace("whatsapp", "vhaatsapp")
            ofp.write(tline)
        # close files
        ifp.close()
        ofp.close()

    elif args.output_format == 'json':
        # source and target languages from the command line
        source = args.source
        target = args.target

        # Full forward (source -> target) transliterator at SENTENCE level
        forward_transl_full = Transliterator(source=source, target=target,
                                             rb=args.rb, build_lookup=args.build_lookup)
        # Forward (source -> target) transliterator at TOKEN level; used to
        # transliterate every token independently with multiple choices (beamsearch)
        forward_transl_token = Transliterator(source=source, target=target,
                                              rb=args.rb, decode='beamsearch')
        # Backward (target -> source) transliterator at TOKEN level; used to check
        # the back-transliteration of each candidate
        back_transl_token = Transliterator(source=target, target=source,
                                           rb=args.rb, build_lookup=args.build_lookup)

        # Tokenizers for the source and target languages
        tk = Tokenizer(lang=ISO_3to2[source])
        tk_back = Tokenizer(lang=ISO_3to2[target])

        # Soundex instance for checking phonetic similarity between words
        instance = Soundex()

        # array of output sentences
        output = []
        # words already processed (word -> last seen end offset)
        seen = {}

        # read the entire source text to transliterate
        document_input = ifp.read()
        # document_input divided by lines
        lines = document_input.splitlines()
        # progressively built transliterated text
        document_translitted = u""

        for l in lines:
            # treat special cases
            if u"\u0950" in l and source == 'hin' and target == 'eng':
                l = l.replace(u"\u0950", "om")

            # prepare a json object for every line
            json = {}
            # transform the entire sentence as the first choice
            definitive = forward_transl_full.transform(l)
            # add the transliterated line to the full transliterated text
            document_translitted += definitive + u"\n"

            # tokenize the initial sentence
            tokens = tk.tokenize(l)
            # tokenize the transformed text
            back_tokens = tk_back.tokenize(clean_str(definitive))

            # the text field is the sentence's first choice without alternatives
            # (stdout mode); set it up front so the per-token fixes below can update it
            json["text"] = definitive
            #json["tokenization"] = back_tokens
            json["tokens"] = []

            # index of the token inside the sentence, without punctuation
            count_tokens = 0

            # pair each source token with its transliterated counterpart
            for index, (t, choosen) in enumerate(zip(tokens, back_tokens)):
                inner_json = {}
                # suggestions and exclusions for the chosen token
                suggestions = []
                exclusions = []
                # transliterate the source token with beamsearch (5 results)
                forward_out = forward_transl_token.transform(t)
                # walk the alternatives
                for c in forward_out:
                    # back-transliterate every alternative
                    back_out = back_transl_token.transform(c)
                    # if the back-transliterated token equals the initial token but
                    # differs from the chosen transliteration, it is a suggestion
                    if back_out == t and c != choosen:
                        suggestions.append(clean_str(c))
                    else:
                        if c != choosen:
                            exclusions.append(clean_str(c))

                # add the chosen token to all possible choices [suggestions + choice]
                all_possible_choices = list(suggestions)
                all_possible_choices.insert(0, choosen)

                # map all suggestions (+ the chosen token) to a phonetic code with Soundex
                transformed = []
                for c in all_possible_choices:
                    p = instance.soundex(c)
                    transformed.append(p)

                duplicates = {}
                for p, original_text in zip(transformed, all_possible_choices):
                    if p not in duplicates:
                        duplicates[p] = []
                    duplicates[p].append(clean_str(original_text))

                new_duplicates = {}
                suggestion_duplicates = []
                # for every phonetic group keep the first spelling; the rest are duplicates
                for _, v in duplicates.items():
                    new_duplicates[v[0]] = v[1:]
                    suggestion_duplicates.extend(v[1:])

                #my_regex = u'(\s|^)%s(\s|$)' % choosen
                #my_regex = r"\b" + re.escape(choosen) + r"\b"

                if source == 'kan' and target == 'eng':
                    new_choosen = resolveKannada(choosen, exclusions)
                    if new_choosen != choosen:
                        exclusions.remove(new_choosen)
                        exclusions.append(choosen)
                        if choosen in new_duplicates:
                            new_duplicates[new_choosen] = new_duplicates.pop(choosen)
                        if new_choosen not in json["text"]:
                            json["text"] = json["text"].replace(choosen, new_choosen)
                            new_last_line = document_translitted.strip().split(
                                u"\n")[-1].replace(choosen, new_choosen)
                            document_translitted = u'\n'.join(
                                document_translitted.split(u"\n")[0:-2]) + "\n" + new_last_line + "\n"
                        choosen = new_choosen

                if (source == 'hin' or source == 'kan') and target == 'eng':
                    new_choosen = resolveHindi(choosen, exclusions)
                    if new_choosen != choosen:
                        exclusions.remove(new_choosen)
                        exclusions.append(choosen)
                        if choosen in new_duplicates:
                            new_duplicates[new_choosen] = new_duplicates.pop(choosen)
                        if new_choosen not in json["text"]:
                            json["text"] = json["text"].replace(choosen, new_choosen)
                            new_last_line = document_translitted.strip().split(
                                u"\n")[-1].replace(choosen, new_choosen)
                            document_translitted = u'\n'.join(
                                document_translitted.split(u"\n")[0:-2]) + "\n" + new_last_line + "\n"
                        choosen = new_choosen

                if target == 'eng':
                    if choosen == "whatsapp" and t != "whatsapp":
                        definitive = definitive.replace("whatsapp", "vhaatsapp")
                        choosen = "vhaatsapp"
                        new_last_line = document_translitted.strip().split(
                            u"\n")[-1].replace("whatsapp", "vhaatsapp")
                        document_translitted = u'\n'.join(
                            document_translitted.split(u"\n")[0:-2]) + "\n" + new_last_line + "\n"
                        exclusions.append("whatsapp")

                if choosen == t:
                    exclusions = []

                json["text"] = definitive

                r = re.compile(my_regex(choosen), flags=re.I | re.X | re.UNICODE)
                # length of the chosen token, ignoring non-spacing marks
                length = len([1 for c in choosen if c not in UNICODE_NSM_ALL])

                for m in r.finditer(document_translitted.strip()):
                    # take the first occurrence inside the full text not seen yet
                    word = m.group()
                    characterOffsetBegin = m.start()
                    characterOffsetEnd = characterOffsetBegin + length - 1
                    found = -1
                    if word in seen:
                        found = seen[word]
                    if characterOffsetBegin > found:
                        count_tokens += 1
                        seen[word] = characterOffsetEnd
                        inner_json["source"] = t
                        inner_json["token"] = choosen
                        inner_json["index"] = count_tokens
                        inner_json["duplicates"] = new_duplicates
                        inner_json["exclusions"] = exclusions
                        inner_json["suggestions"] = [
                            s for s in suggestions if s not in suggestion_duplicates
                        ]
                        inner_json['characterOffsetBegin'] = characterOffsetBegin
                        inner_json['characterOffsetEnd'] = characterOffsetEnd
                        json["tokens"].append(inner_json)
                        break

            output.append(json)

        final_output = {"sentences": output}
        r = js.dumps(final_output)
        ofp.write(r)

        # close files
        ifp.close()
        ofp.close()
    else:
        # close files
        ifp.close()
        ofp.close()
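# Standalone sketch of the Soundex grouping used above. The import path is an
# assumption (the original only shows Soundex() being instantiated; LibIndic's
# soundex package exposes it this way). Candidates that share a phonetic code
# collapse onto the first spelling seen; the rest become its duplicates.
from soundex import Soundex

def group_by_soundex(candidates):
    instance = Soundex()
    groups = {}
    for cand in candidates:
        code = instance.soundex(cand)
        groups.setdefault(code, []).append(cand)
    # first spelling of each phonetic group -> its alternative spellings
    return {v[0]: v[1:] for v in groups.values()}

print(group_by_soundex(["rulaya", "rulaaya", "rulayaa"]))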