def load_data():
    lang1 = "eng"
    lang2 = "fra"
    print("Reading lines...")
    lines = open('../assets/data/%s-%s.txt' % (lang1, lang2),
                 encoding='utf-8').read().strip().split('\n')
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
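# For reference: a minimal sketch of the Lang vocabulary class that the
# loaders in this file assume. It follows the classic PyTorch seq2seq
# tutorial these snippets resemble; the real class likely differs (other
# call sites below also expect addWord, add_word_list, build_vocab,
# load_dict, or a no-argument constructor).
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name=None):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = 2  # count SOS and EOS

    def add_sentence(self, sentence):
        # register every whitespace-separated token of the sentence
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1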
def set_advt_frame(self, parent):
    self.setting_panel = wx.Panel(parent)
    self.setting_panel.SetSizer(self.setting_sizer)
    parent.AddPage(self.setting_panel, self.module_name)
    self.set_load_module()
    self.opt_sizer = wx.BoxSizer(wx.VERTICAL)
    self.grid_sizer = wx.FlexGridSizer(rows=15, cols=2)
    for opt in self.exists_options:
        lbl = wx.StaticText(self.setting_panel, -1, opt)
        self.cfg_ctr[opt] = txt = wx.TextCtrl(self.setting_panel, -1,
                                              self.cfg['PHP'][opt])
        self.grid_sizer.Add(lbl, 0, wx.ALL, 5)
        self.grid_sizer.Add(txt, 0, wx.ALL, 3)
    conf_btn = wx.Button(self.setting_panel, -1, Lang().get('php_config_file'))
    conf_btn.Bind(wx.EVT_BUTTON, self.open_config_file)
    save_btn = wx.Button(self.setting_panel, -1, Lang().get('php_save_config'))
    save_btn.Bind(wx.EVT_BUTTON, self.save_config)
    self.handler_sizer = wx.BoxSizer(wx.HORIZONTAL)
    self.handler_sizer.AddMany([(conf_btn), (save_btn)])
    self.opt_sizer.Add(self.grid_sizer)
    self.opt_sizer.Add(self.handler_sizer, 0, wx.TOP, 5)
    self.setting_sizer.Add(self.opt_sizer, 0, wx.ALL, 5)
def readLang(filepath, reverse=False):
    print("reading lines...")

    # read the file and split into lines
    lines = open(filepath, encoding='utf-8').read().strip().split('\n')

    # getting the language names from filename
    # filename = os.path.splitext(os.path.basename(filepath))[0]
    # lang = filename.split('-')

    # split every line into pairs and normalize
    # pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # For chatbot, normalize only input side
    pairs = []
    for l in lines:
        split = l.split('\t')
        pairs.append([split[0], split[1]])
        # pairs.append([util.normalize_no_punc(split[0]), split[1]])
        # pairs.append([util.normalize_no_punc(split[0]), split[2]])

    # reverse pairs if needed, make lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        source_lang = Lang()
        target_lang = Lang()
    else:
        source_lang = Lang()
        target_lang = Lang()

    return source_lang, target_lang, pairs
def read_langs(lang1, lang2, reverse=False):
    """Read sentences and instantiate Lang objects

    Args:
        lang1 (str): name of the first language
        lang2 (str): name of the second language
        reverse (bool, optional): reverse language order? Defaults to False.

    Returns:
        (Lang, Lang, list): input language, output language, sentence pairs
    """
    print('Reading the lines...')

    # read the file and split into lines
    lines = open(f'data/{lang1}-{lang2}.txt',
                 encoding='utf-8').read().strip().split('\n')

    # create pairs (source and target are separated by a tab '\t')
    pairs = [[normalize_string(sentence) for sentence in line.split('\t')]
             for line in lines]

    # reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(pair)) for pair in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
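# A hedged usage sketch for read_langs above: assumes a tab-separated
# data/eng-fra.txt plus the Lang and normalize_string helpers from this
# codebase. With reverse=True the pairs (and Lang roles) are swapped, so a
# model trained on the result would translate fra -> eng.
input_lang, output_lang, pairs = read_langs('eng', 'fra', reverse=True)
print(len(pairs), 'pairs; first pair:', pairs[0])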
def __init__(self, config: Namespace, shuffle_at_init=False, seed=None):
    super(StandardDataset, self).__init__()
    self.config = config
    self.anno_lang = Lang("anno")
    self.code_lang = Lang("code")
    self.__preprocess(shuffle_at_init, seed)
def load_data():
    lines = open('../assets/SMSSpamCollection.txt').readlines()
    # Each line is 'label<TAB>text', so pair[0] is the label and pair[1]
    # the message; the input language is built from the text side.
    pairs = [[normalize_string(s) for s in line.split('\t')] for line in lines]
    input_lang = Lang("txt")
    output_lang = Lang("label")
    for pair in pairs:
        input_lang.add_sentence(pair[1])
        output_lang.add_sentence(pair[0])
    return input_lang, output_lang, pairs
def prepare_data(source_lang, target_lang, reverse=False):
    filename = '%s-%s.txt' % (source_lang, target_lang)
    pairs = get_pair(os.path.join(data_root, filename), reverse)
    if reverse:
        source_lang_dict = Lang(target_lang)
        target_lang_dict = Lang(source_lang)
    else:
        source_lang_dict = Lang(source_lang)
        target_lang_dict = Lang(target_lang)
    for p in pairs:
        source_lang_dict.add_word_list(p[0])
        target_lang_dict.add_word_list(p[1])
    return pairs, source_lang_dict, target_lang_dict
def read_data(lang_1, lang_2, reverse=False):
    print("importing dataset")
    f = open("data/{}-{}.txt".format(lang_1, lang_2), "r",
             encoding="utf-8").read().strip()
    pairs = [[normalize_string(s) for s in l.split("\t")]
             for l in f.split("\n")]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang_2)
        output_lang = Lang(lang_1)
    else:
        input_lang = Lang(lang_1)
        output_lang = Lang(lang_2)
    return input_lang, output_lang, pairs
def choose_dir(self, event):
    # Choose and update the document root directory
    select_dir = wx.DirDialog(None, Lang().get('apache_choose_doc_root') + ':',
                              self.cfg_doc_root.GetLabelText(),
                              style=wx.DD_DEFAULT_STYLE | wx.DD_NEW_DIR_BUTTON)
    if select_dir.ShowModal() == wx.ID_OK and os.path.isdir(select_dir.GetPath()):
        self.cfg_doc_root.SetLabelText(select_dir.GetPath())
    select_dir.Destroy()
def make_lang(train_x, dev_x, test_x):
    lang = Lang('essays')
    train_words = []
    for essay in train_x:
        if params['padding_level'] == 'document':
            train_words += essay
        elif params['padding_level'] == 'sentence':
            for sent in essay:
                train_words += sent
    lang.build_vocab(train_words, voc_size=params['voc_size'])
    params['voc_size'] = lang.n_words
    lang.make_embeddings(emb_size=params['emb_size'],
                         emb_type=params['emb_type'])
    return lang
def __init__(self, directory):
    self.word_lang = Lang("normal word")
    self.num_normal_word = -1
    self.MAX_LENGTH = MAX_LENGTH
    self.MAX_MEM_SIZE = MAX_MEM_SIZE
    with open(directory, "rb") as pickle_d:
        self.raw_data = pickle.load(pickle_d)
    self.prepare_lang()
def decoder_load(file_path):
    attr_dict = torch.load(file_path)
    hidden_size = attr_dict['hidden_size']
    lang = Lang(attr_dict['lang'])
    dropout_p = attr_dict['dropout_p']
    max_length = attr_dict['max_length']
    decoder = AttnDecoderRNN(hidden_size, lang, dropout_p, max_length)
    decoder.loadAttributes(attr_dict)
    return decoder
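# decoder_load above rebuilds the decoder from a plain attribute dict. A
# hypothetical sketch of the matching save side, with field names mirrored
# from the loader; the real attr_dict must also carry whatever
# loadAttributes consumes (e.g. the weights).
def decoder_save(decoder, lang_name, file_path):
    attr_dict = {
        'hidden_size': decoder.hidden_size,
        'lang': lang_name,
        'dropout_p': decoder.dropout_p,
        'max_length': decoder.max_length,
        # ... plus the state consumed by loadAttributes
    }
    torch.save(attr_dict, file_path)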
def build_char_lang():
    lang = Lang()
    lang.word2index = dict()
    lang.index2word = dict()
    lang.n_words = 0
    chars = "!\"$%&'()*+,-./0123456789:;<>?[]abcdefghijklmnopqrstuvwxyz"
    for c in chars:
        lang.addWord(c)
    return lang
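# A usage sketch for build_char_lang, assuming its addWord fills word2index
# the same way add_word does in the tutorial-style Lang: each character of
# the fixed alphabet maps to a stable integer index.
char_lang = build_char_lang()
indices = [char_lang.word2index[c] for c in "hello"]  # one id per character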
def readLangs(lang1, lang2, df, reverse=False):
    """
    Initialize the language models and sentence pairs.

    Args:
        lang1 (str): name of the source corpus
        lang2 (str): name of the target corpus for generation
        df (pandas.DataFrame): dataframe holding the sentence pairs
        reverse (bool, optional): reverse the pair order. Defaults to False.

    Returns:
        (Lang, Lang, List[List[str]]): (src, target, sentence pairs)
    """
    # `reverse` is currently unused: pairs are taken from the dataframe as-is
    pairs = df.values.tolist()
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def __init__(self, device, log_name):
    environ = os.environ
    self.device_id = environ.get(device)
    if self.device_id is None:
        # fall back to treating `device` as the device id when no
        # matching environment variable is set
        self.device_id = device
    self._device, self._easy_device, self.hierarchyviewer = \
        self._connect_device(self.device_id)
    self._logger = createlogger(log_name)
    self._log_path = create_folder()
    self._config = GetConfigs("common")
    self._lang = Lang(self.device_id)
def read_langs(lang1, lang2, reverse=False):
    """
    reverse == False: input [tab] output pair.
    reverse == True:  output [tab] input pair.
    """
    print("Reading lines ...")
    file_path = 'data/{}-{}.txt'.format(lang1, lang2)
    # lines = open(file_path).readlines()
    pairs = []
    for line in open(file_path):
        pair = [normalize(s) for s in line.strip().split('\t')]
        if reverse:
            pair = list(reversed(pair))
        pairs.append(pair)
    if reverse:
        lang1, lang2 = lang2, lang1
    return Lang(lang1), Lang(lang2), pairs
def readLangs(lang1, lang2, reverse=False):
    print('Reading lines ...')

    # Read the file and split into lines
    f = open('data/{}-{}.txt'.format(lang1, lang2), 'r', encoding='utf-8')
    lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(p) for p in l.strip().split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = codecs.open('data/%s-%s.txt' % (lang1, lang2), 'r', 'utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
def create_datasets(data_file, lang1_name, lang2_name, max_len=10,
                    percentage=0.1):
    pairs = read_datafile(data_file)
    lang1_sents = [pair[0] for pair in pairs]
    lang2_sents = [pair[1] for pair in pairs]
    lang1 = Lang(lang1_name, lang1_sents)
    lang2 = Lang(lang2_name, lang2_sents)
    # randomly hold out roughly `percentage` of the pairs as a test split
    choose = [1 if random.random() > percentage else 0
              for i in range(len(pairs))]
    train_idx = [i for i, k in enumerate(choose) if k == 1]
    test_idx = [i for i, k in enumerate(choose) if k == 0]
    train_pairs = [pair for i, pair in enumerate(pairs) if i in train_idx]
    test_pairs = [pair for i, pair in enumerate(pairs) if i in test_idx]
    train_dataset = SentencePairDataset(train_pairs, lang1, lang2, max_len)
    test_dataset = SentencePairDataset(test_pairs, lang1, lang2, max_len)
    return train_dataset, test_dataset
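# A sketch of consuming the datasets returned by create_datasets with
# PyTorch; assumes SentencePairDataset implements __len__/__getitem__ and
# that items are already tensorized (otherwise a custom collate_fn is
# needed). The file name here is only an example.
from torch.utils.data import DataLoader

train_dataset, test_dataset = create_datasets('data/eng-fra.txt',
                                              'eng', 'fra',
                                              max_len=10, percentage=0.1)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)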
def __init__(self, word_vectors, max_length, char_embed=False,
             seeder=int(time.time())):
    super(PreTrainedEmbeddingEncoderBiRNN, self).__init__(
        word_vectors.vector_size, max_length, seeder=seeder)
    self.model_type = 'pre_trained_embedding'

    # define word vector embedding
    self.word_vectors = word_vectors

    # empty vector for oov (self.empty_vector is presumably initialized
    # by the parent class)
    self.empty_vector = Variable(torch.Tensor(self.empty_vector)).view(
        1, 1, -1)

    # char embed
    self.char_embed = char_embed
    if self.char_embed:
        lang = Lang()
        lang.word2index = dict()
        lang.index2word = dict()
        lang.n_words = 0
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
        for c in chars:
            lang.addWord(c)
        self.charbased_model = WordEncoderBiRNN(self.hidden_size // 2,
                                                params.CHAR_LENGTH, lang,
                                                seeder=seeder)

    # word vector for start of string
    sos = torch.ones(self.hidden_size)
    self.sos_vector = Variable(sos).view(1, 1, -1)
    # word vector for end of string
    eos = torch.ones(self.hidden_size) * -1
    self.eos_vector = Variable(eos).view(1, 1, -1)

    if params.USE_CUDA:
        self.cuda()
        self.empty_vector = self.empty_vector.cuda()
        self.sos_vector = self.sos_vector.cuda()
        self.eos_vector = self.eos_vector.cuda()

    self.cache_dict = dict()
    self.cache_dict[params.SOS_TOKEN] = self.sos_vector
    self.cache_dict[params.EOS_TOKEN] = self.eos_vector
def encoder_load(file_path, word_vector=None):
    attr_dict = torch.load(file_path)
    encoder_type = attr_dict['model_type']
    encoder = None
    if encoder_type == "word_based":
        encoder = WordEncoderRNN(attr_dict['hidden_size'],
                                 Lang(attr_dict['lang']))
        encoder.loadAttributes(attr_dict)
    elif encoder_type == "pre_trained_embedding":
        encoder = WordEncoderRNN(word_vector)
        encoder.loadAttributes(attr_dict)
    elif encoder_type == "word_vector_based":
        encoder = EmbeddingEncoderInputRNN(attr_dict['hidden_size'])
        encoder.loadAttributes(attr_dict)
    return encoder
def __init__(self):
    super(AutoTransferGUI, self).__init__()
    self.__consoleConfig = MainConfigUpdater("config.json")
    self.__tableName = ""
    self.__codeTextBoxXPath = ""
    self.__codeEnterBtnXPath = ""
    self.__transferBtnClassName = ""
    self.__transferBtnSelector = ""
    self.__checkBoxClassName = ""
    self.__fileTreeNodeClassName = ""
    self.__fileTreeDialogXPath = ""
    self.__destnationPath = ""
    self.__fileTreeConfirmClassName = ""
    self.__notFoundID = ""
    self.__langFloder = "." + os.sep + "lang" + os.sep
    self.__langList = []
    self.__config = {}
    self.__curLang = "en"
    self.__getLangList()
    self.__loadConfig()
    self.__isTransferStarted = False
    self.__transferFramework = None
    self.__transferDBFile = ""
    self.__runMode = 0
    self.__lang = Lang(self.__curLang)
    if self.__lang.reload(self.__curLang) == -1:
        print("Language Pack Error.")
        sys.exit(1)
    self.__title = self.__lang.get("title")
    self.__left = 50
    self.__top = 50
    self.__width = 500
    self.__heigth = 600
    self.__widgetList = []
    self.__initUI()
    self.show()
def make_lang(texts):
    lang = Lang('texts')
    words = []
    for text in texts:
        for sent in text:
            words += sent
    lang.build_vocab(words, voc_size=params['voc_size'])
    params['voc_size'] = lang.n_words
    lang.make_embeddings(emb_size=params['emb_size'],
                         emb_type=params['emb_type'])
    return lang
def set_advt_frame(self, parent):
    self.setting_panel = wx.Panel(parent)
    self.setting_panel.SetSizer(self.setting_sizer)
    parent.AddPage(self.setting_panel, self.module_name)
    self.set_load_module()

    lbl_port = wx.StaticText(self.setting_panel, -1, Lang().get('apache_port'))
    self.cfg_port = wx.TextCtrl(self.setting_panel, -1,
                                self.get_default_port(), size=(200, 20))
    lbl_doc_root = wx.StaticText(self.setting_panel, -1,
                                 Lang().get('apache_doc_root'))
    self.cfg_doc_root = wx.TextCtrl(self.setting_panel, -1,
                                    self.get_doc_root(), size=(200, 20))
    select_dir_btn = wx.Button(self.setting_panel, -1,
                               Lang().get('apache_choose_doc_root'))
    select_dir_btn.Bind(wx.EVT_BUTTON, self.choose_dir)
    conf_btn = wx.Button(self.setting_panel, -1,
                         Lang().get('apache_config_file'))
    conf_btn.Bind(wx.EVT_BUTTON, self.open_config_file)
    log_btn = wx.Button(self.setting_panel, -1, Lang().get('apache_log_file'))
    log_btn.Bind(wx.EVT_BUTTON, self.open_log_file)
    save_btn = wx.Button(self.setting_panel, -1,
                         Lang().get('apache_save_config'))
    save_btn.Bind(wx.EVT_BUTTON, self.save_config)

    self.opt_sizer = wx.BoxSizer(wx.VERTICAL)
    self.grid_sizer = wx.FlexGridSizer(rows=5, cols=3)
    self.grid_sizer.AddMany([
        (lbl_port, 0, wx.ALL, 5),
        (self.cfg_port, 0, wx.ALL, 5),
        (wx.StaticText(self.setting_panel)),
        (lbl_doc_root, 0, wx.ALL, 5),
        (self.cfg_doc_root, 0, wx.ALL, 5),
        (select_dir_btn)
    ])
    self.handler_sizer = wx.BoxSizer(wx.HORIZONTAL)
    self.handler_sizer.AddMany([(conf_btn), (log_btn), (save_btn)])
    self.opt_sizer.Add(self.grid_sizer)
    self.opt_sizer.Add(self.handler_sizer, 0, wx.TOP, 5)
    self.setting_sizer.Add(self.opt_sizer, 0, wx.ALL, 5)
def main():
    logger = Logger()
    config = Config()
    lang = Lang(config.lang)
    players = []
    for account in config.accounts:
        if account["disabled"]:
            logger.logger.info(
                lang.lang("main.auth.disabled").format(email=account["email"]))
            continue
        player = Player(account=account["email"],
                        password=account["password"],
                        server_address=config.server["ip"],
                        port=config.server["port"],
                        version=498,
                        auto_reconnect=config.auto_reconnect,
                        auto_respawn=config.auto_respawn,
                        lang=lang)
        players.append(player)
        time.sleep(1)
    bot = ChatBot(players, lang)
    bot.start_listening()
def lang_repr(self):
    return Lang(self.lang).name
def main():
    """
    Executes the baseline model. This loads the training data, training
    labels, and dev data, then trains a logistic regression model, then
    dumps predictions to the specified file.

    Modify the middle of this code, between the two commented blocks, to
    create your own model.
    """
    parser = argparse.ArgumentParser(
        description='Duolingo shared task baseline model')
    parser.add_argument('--language', default='en_es',
                        help='choose from [es_en, en_es, fr_en]',
                        required=False)
    parser.add_argument('--dataset_path', default='../data/%s/',
                        required=False)
    parser.add_argument('--outputs_path', default='./outputs/',
                        required=False)
    args = parser.parse_args()

    dataset_path = args.dataset_path % args.language
    assert os.path.isdir(dataset_path)
    train_path = dataset_path + '%s.slam.20190204.train' % args.language
    dev_path = dataset_path + '%s.slam.20190204.dev' % args.language
    test_path = dataset_path + '%s.slam.20190204.test' % args.language
    assert os.path.isfile(train_path)
    assert os.path.isfile(dev_path)
    assert os.path.isfile(test_path)
    if not os.path.isdir(args.outputs_path):
        os.mkdir(args.outputs_path)

    # ======================== Hyper Parameter ========================
    dbg = False
    from_path = None
    # from_path = './saved_model/seq2seq_nomlp_20'
    # from_path = './saved_model/seq2seq_exp_20'
    # from_path = './saved_model/attention_v2_20'
    # from_path = './saved_model/cnn_3'
    # from_path = './saved_model/seq2seq_c_7'
    # from_path = './saved_model/transformer_1'
    epochs = 10 if dbg else 10
    lang = Lang()

    # ======================== Data Loading ========================
    print('Begin Data Loading')
    start_time = time.time()
    training_data, training_labels = load_data(train_path, lang, dbg=dbg,
                                               use_all_features=True)
    dev_data = load_data(dev_path, lang, use_all_features=True)
    test_data = load_data(test_path, lang, use_all_features=True)
    users = list(
        get_users(training_data).union(get_users(dev_data)).union(
            get_users(test_data)))
    lang.addUsers(users)
    for i in range(len(lang.letters)):
        lang.letter2Index[lang.letters[i]] = i
    end_time = time.time()
    print('Data Loaded\t Time Taken %0.2fm' % ((end_time - start_time) / 60))
    model = Model(lang)

    # ======================== Training ========================
    if from_path is None:
        print('Begin Training')
        train_loader = get_dataloader(training_data, lang, training_labels)
        model.train(train_loader, epochs)

    # ======================== Inference ========================
    # print('Begin Inference-Dev', end=' ')
    # start_time = time.time()
    # dev_loader = get_dataloader(dev_data, lang)
    # predictions = model.predict_for_set(dev_loader, from_path)
    # with open(args.outputs_path + '%s_dev_predictions.pred' % args.language, 'wt') as f:
    #     for instance_id, prediction in iteritems(predictions):
    #         f.write(instance_id + ' ' + str(prediction) + '\n')
    # end_time = time.time()
    # print('| %0.2fm' % ((end_time-start_time)/60))

    print('Begin Inference-Test', end=' ')
    start_time = time.time()
    test_loader = get_dataloader(test_data, lang)
    predictions = model.predict_for_set(test_loader, from_path)
    with open(args.outputs_path + '%s_test_predictions.pred' % args.language,
              'wt') as f:
        for instance_id, prediction in predictions.items():
            f.write(instance_id + ' ' + str(prediction) + '\n')
    end_time = time.time()
    print('| %0.2fm' % ((end_time - start_time) / 60))
    if len(split) >= max_len:
        split = split[:max_len - 1]
    return ' '.join(split)


# Config variables
encoder_file = 'model/chatbot/augmented_data/word2vec/skipgram/twitter_sgram/encoder-d100-e5.pt'
decoder_file = 'model/chatbot/augmented_data/word2vec/skipgram/twitter_sgram/decoder-d100-e5.pt'
encoder_attr_dict = torch.load(encoder_file)
decoder_attr_dict = torch.load(decoder_file)

# Dataset (for building the dictionary)
# src_lang, tgt_lang, pairs = prepareData('dataset/input-output.txt', reverse=False)

# Lang
decoder_lang = Lang()
decoder_lang.load_dict(decoder_attr_dict['lang'])

# Word vectors
# word_vector = KeyedVectors.load_word2vec_format("word_vector/koran.vec", binary=True)
word_vectors = KeyedVectors.load(params.WORD_VECTORS_FILE)

# Params
use_cuda = params.USE_CUDA
hidden_size = word_vectors.vector_size

# Encoder & Decoder
# encoder = EncoderEmbeddingRNN(src_lang.n_words, hidden_size, word_vector)
# attn_decoder = AttnDecoderRNN(hidden_size, tgt_lang.n_words, dropout_p=0.1, max_length=max_len)
# encoder.loadState(ENCODER_MODEL)
# attn_decoder.loadState(DECODER_MODEL)
    controllers.ajax.app,
    '/login', controllers.login.app,
    '/logout', 'logout',
    '/lang/(.*)', 'switchLang',
    '', controllers.index.app)
app = web.application(urls, globals())

# Save session to database or to disk
if environ.get('DATABASE_URL'):
    db = web.database()
    store = web.session.DBStore(db, 'sessions')
    session = web.session.Session(app, store, initializer=GLOBALS['defaults'])
else:
    session = web.session.Session(app, web.session.DiskStore('sessions'),
                                  initializer=GLOBALS['defaults'])

lang = Lang(app, session, pwd)

render = web.template.render(pwd + '/templates/', base='_layout',
                             globals=GLOBALS)
prender = web.template.render(pwd + '/templates/', globals=GLOBALS)


def debug(x):
    # pretty-print x inside an HTML-escaped <pre> block
    return '<pre class="debug">' + pprint.pformat(x, indent=4) \
        .replace('\\n', '\n') \
        .replace('&', '&amp;') \
        .replace("<", "&lt;") \
        .replace(">", "&gt;") \
        .replace('"', '&quot;') + '</pre>'
# NOTE: this snippet is truncated at the top; the imports for
# logging/SMTPHandler, Flask, SQLAlchemy, Moment, Babel, and Config below
# are reconstructed from how the names are used, and the module providing
# Lang is not shown in the original.
from logging.handlers import SMTPHandler

from flask import Flask, request
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_login import LoginManager
from flask_mail import Mail
from flask_moment import Moment
from flask_bootstrap import Bootstrap
from flask_babel import Babel

from config import Config

app = Flask(__name__, static_url_path='/static')
app.config.from_object(Config)
db = SQLAlchemy(app)
migrate = Migrate(app, db)
bootstrap = Bootstrap(app)
mail = Mail(app)
moment = Moment(app)
login = LoginManager(app)
login.login_view = 'login'
login.login_message = Lang().get_str('login_message')
babel = Babel(app)

from app import routes, models, errors

if not app.debug:
    if app.config['MAIL_SERVER']:
        auth = None
        if app.config['MAIL_USERNAME'] or app.config['MAIL_PASSWORD']:
            auth = (app.config['MAIL_USERNAME'], app.config['MAIL_PASSWORD'])
        secure = None
        if app.config['MAIL_USE_TLS']:
            secure = ()
        mail_handler = SMTPHandler(
            mailhost=(app.config['MAIL_SERVER'], app.config['MAIL_PORT']),
            fromaddr='no-reply@' + app.config['MAIL_SERVER'],