def __init__(self):
    print(os.path.join(self.SETTINGS_DIR, 'corpus.json'))
    f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'), 'r', encoding='utf-8')
    self.settings = json.loads(f.read())
    f.close()
    self.name = self.settings['corpus_name']
    self.languages = self.settings['languages']
    if len(self.languages) <= 0:
        self.languages = [self.name]
    self.input_format = self.settings['input_format']
    self.corpus_dir = os.path.join('../corpus', self.name)
    self.iterSent = None
    if self.input_format in ['json', 'json-gzip']:
        self.iterSent = JSONDocReader(format=self.input_format)
    self.goodWordFields = ['lex', 'wf', 'wf_display', 'parts', 'gloss',
                           'gloss_index', 'n_ana', 'trans_en', 'trans_ru']
    self.AdditionalWordFields = set()
    if 'word_fields' in self.settings:
        self.AdditionalWordFields |= set(self.settings['word_fields'])
    if 'word_table_fields' in self.settings:
        self.AdditionalWordFields |= set(self.settings['word_table_fields'])
    if 'accidental_word_fields' in self.settings:
        self.AdditionalWordFields -= set(self.settings['accidental_word_fields'])
    f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'), 'r', encoding='utf-8')
    categories = json.loads(f.read())
    self.goodWordFields += ['gr.' + v for lang in categories
                            for v in categories[lang].values()]
    self.goodWordFields = set(self.goodWordFields)
    f.close()
    self.pd = PrepareData()
    self.es = Elasticsearch()
    self.es_ic = IndicesClient(self.es)
    self.shuffled_ids = [i for i in range(1, 1000000)]
    random.shuffle(self.shuffled_ids)
    self.shuffled_ids.insert(0, 0)    # id=0 is special and should not change
    self.tmpWordIDs = [{} for i in range(len(self.languages))]    # word as JSON -> its integer ID
    self.tmpLemmaIDs = [{} for i in range(len(self.languages))]   # lemma as string -> its integer ID
    self.word2lemma = [{} for i in range(len(self.languages))]    # word's ID -> ID of its lemma (or -1, if none)
    self.wordFreqs = [{} for i in range(len(self.languages))]     # word's ID -> its frequency
    self.wordSFreqs = [{} for i in range(len(self.languages))]    # word's ID -> its number of sentences
    self.wordDocFreqs = [{} for i in range(len(self.languages))]  # (word's ID, dID) -> word frequency in the document
    # self.wordSIDs = [{} for i in range(len(self.languages))]    # word's ID -> set of sentence IDs
    self.wordDIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of document IDs
    self.wfs = set()         # set of word forms (for sorting)
    self.lemmata = set()     # set of lemmata (for sorting)
    self.sID = 0             # current sentence ID for each language
    self.dID = 0             # current document ID
    self.wID = 0             # current word ID
    self.wordFreqID = 0
    self.numWords = 0        # number of words in current document
    self.numSents = 0        # number of sentences in current document
    self.numWordsLang = [0] * len(self.languages)   # number of words in each language in current document
    self.numSentsLang = [0] * len(self.languages)   # number of sentences in each language in current document
    self.totalNumWords = 0
def init():
    prepare_pku = PrepareData(constant.VOCAB_SIZE, 'corpus/pku_training.utf8',
                              'corpus/pku_training_words.txt',
                              'corpus/pku_training_labels.txt',
                              'corpus/pku_training_dict.txt')
    prepare_pku.build_exec()
    dict_name = 'corpus/pku_training_dict.txt'
    copyfile(dict_name, 'corpus/dict.utf8')
    trans_dnn = TransformDataDNN(constant.DNN_SKIP_WINDOW, True)
    trans_dnn.generate_exe()
def __init__(self, config, paths, vocab):
    self.vocab = vocab
    if isinstance(paths, str):
        paths = [paths]
    prepare_data = PrepareData(min_count=config.MIN_COUNT,
                               max_length=config.MAX_LENGTH)
    parsed_pairs = sum(
        [prepare_data.read_pairs(datafile) for datafile in paths], [])
    self.data = CornellMovieDialogDataset.make_dataset(parsed_pairs, vocab)
def parse_args(self):
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-n', '--network',
        dest='network_filename',
        type=str,
        required=True,
        help='Name of the input hdf5 file with the neural network'
    )
    arg_parser.add_argument(
        '-st', '--stats',
        dest='stats',
        nargs='?',
        const=True,
        required=False,
        help='Show accuracy stats',
        default=False
    )
    arg_parser.add_argument(
        '-r', '--print-results',
        dest='print_results',
        nargs='?',
        const=True,
        required=False,
        help='Show every board state',
        default=False
    )
    arg_parser.add_argument(
        '--disable-cuda',
        dest='disable_cuda',
        nargs='?',
        const=True,
        required=False,
        help='Add this flag to use the CPU instead of the GPU',
        default=False
    )
    arg_parser.add_argument(
        '-l', '--log',
        dest='log_to_file',
        nargs='?',
        const=True,
        required=False,
        default=False,
        help='Add this flag if board states should be stored in a file'
             ' (that can be played back later with a visualizer)'
    )
    self.args = arg_parser.parse_args()
    if 'ds2048_1' in self.args.network_filename:
        self.preprocessing_method = PrepareData.pre_process1
    else:
        self.preprocessing_method = PrepareData.pre_process2
    self.input_vector_size = PrepareData.get_vector_size(self.preprocessing_method)
    self.network_filename = self.args.network_filename
def predict_sentiment():
    input_json = request.get_json()
    input_review = json.loads(input_json["review"])
    prep = PrepareData(input_fields=config.INPUT_FIELDS,
                       text_field=config.TEXT_FIELD,
                       target_field=config.TARGET_FIELD,
                       maxlen=config.MAX_LEN,
                       padding="pre",
                       vocab_size=config.VOCAB_SIZE)
    x = prep.run_prep(input_review)
    predicted_prob = _infer(
        model_input={"inputs": {
            "text_input": x.values.tolist()
        }})["outputs"]
    predicted_prob = np.array(predicted_prob)
    sentiments = np.where(predicted_prob > 0.5, 1, 0)
    return jsonify(sentiments.tolist())
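# A hypothetical client call for the Flask handler above, useful for smoke-testing it.
# The route path and port are assumptions (the snippet does not show the route
# registration), and the field names inside `review` depend on config.INPUT_FIELDS /
# config.TEXT_FIELD, so treat them as placeholders.
import json

import requests

review = {"text": "Great movie, would watch again."}  # placeholder field name
resp = requests.post(
    "http://localhost:5000/predict_sentiment",        # assumed route and port
    json={"review": json.dumps(review)},              # the handler json.loads() this field
)
print(resp.json())  # e.g. [[1]] for a positive prediction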
def add_aux_corpus(self):
    src_rare_vocab = rare_vocab_create(self.src_vocab)
    trg_rare_vocab = rare_vocab_create(self.trg_vocab)
    # Extract parallel sentences that contain words from the rare vocabularies
    aux_taiyaku = []
    for src in src_rare_vocab:
        aux_taiyaku.append(scraping(src, "en"))
    for trg in trg_rare_vocab:
        aux_taiyaku.append(scraping(trg, "ja"))
    aux_corpus = './../data/aux_taiyaku.tsv'
    f = open(aux_corpus, 'w')
    for s in aux_taiyaku:
        f.write(s + '\n')
    f.close()
    PrepareData.extract_each_sentence(aux_corpus, './../data/ja_aux_sentences.tsv', 'ja')
    PrepareData.extract_each_sentence(aux_corpus, './../data/en_aux_sentences.tsv', 'en')
    # Build vocabularies for the auxiliary corpus as well
    aux_src_vocab = Tokenizer.en_vocab_create('./../data/en_aux_sentences.tsv')
    aux_trg_vocab = Tokenizer.ja_vocab_create('./../data/ja_aux_sentences.tsv')
    # Remove from aux_corpus any sentence containing a word that is absent from the
    # main vocab and has frequency below 50 even in the auxiliary vocab
    no_use_src_vocab = rare_vocab_create(aux_src_vocab)
    no_use_trg_vocab = rare_vocab_create(aux_trg_vocab)
    # filter with comprehensions instead of deleting by index while iterating
    no_use_src_vocab = [w for w in no_use_src_vocab if w not in self.src_vocab]
    no_use_trg_vocab = [w for w in no_use_trg_vocab if w not in self.trg_vocab]
    no_use_vocab = no_use_src_vocab + no_use_trg_vocab
    aux_taiyaku = [s for s in aux_taiyaku
                   if not any(word in s for word in no_use_vocab)]
    f = open(aux_corpus, 'w')
    for s in aux_taiyaku:
        f.write(s + '\n')
    f.close()
    # Add aux_corpus to the training data
    PrepareData.extract_each_sentence(aux_corpus, './../data/ja_sentences.tsv', 'ja')
    PrepareData.extract_each_sentence(aux_corpus, './../data/en_sentences.tsv', 'en')
def train():
    # prepare data for sentencepiece training
    PrepareData.extract_each_sentence('./../data/taiyaku.tsv', './../data/ja_sentences.tsv', 'ja')
    PrepareData.extract_each_sentence('./../data/taiyaku.tsv', './../data/en_sentences.tsv', 'en')
    # train sentencepiece model
    Tokenizer.train_sentencepiece(32000, 'ja')
    # data augmentation
    da = DataAugmentation()
    da.add_aux_corpus()
    # data filtering process
    fl = Filter('./../data/taiyaku.tsv')
    fl.one_multi_filter()
    print('1st done')
    fl.src_equal_trg_filter()
    print('second done')
    fl.non_alphabet_filter(0.5)
    print('')
    fl.correct_lang_filter()
    fl.update_df()
    # prepare tokenized data for feeding to the model
    PrepareData.prepare_tokenized_taiyaku('./../data/tokenized_taiyaku.tsv')
    t2t = PyT2T()
    # generate data
    t2t.data_gen()
    # train phase
    t2t.train()
def main():
    # Data preprocessing
    data = PrepareData()
    args.src_vocab = len(data.en_word_dict)
    args.tgt_vocab = len(data.cn_word_dict)
    print("src_vocab %d" % args.src_vocab)
    print("tgt_vocab %d" % args.tgt_vocab)

    # Initialize the model
    model = make_model(
        args.src_vocab,
        args.tgt_vocab,
        args.layers,
        args.d_model,
        args.d_ff,
        args.h_num,
        args.dropout
    )

    if args.type == 'train':
        # Training
        print(">>>>>>> start train")
        criterion = LabelSmoothing(args.tgt_vocab, padding_idx=0, smoothing=0.0)
        optimizer = NoamOpt(args.d_model, 1, 2000,
                            torch.optim.Adam(model.parameters(), lr=0,
                                             betas=(0.9, 0.98), eps=1e-9))
        train(data, model, criterion, optimizer)
        print("<<<<<<< finished train")
    elif args.type == "evaluate":
        # Prediction
        # First check whether the model has already been trained (prerequisite)
        if os.path.exists(args.save_file):
            # Load the model
            model.load_state_dict(torch.load(args.save_file))
            # Start evaluation
            print(">>>>>>> start evaluate")
            evaluate(data, model)
            print("<<<<<<< finished evaluate")
        else:
            print("Error: please train before evaluate")
    else:
        print("Error: please select type within [train / evaluate]")
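# A minimal sketch of the argparse namespace that main() above assumes.
# The attribute names (type, layers, d_model, d_ff, h_num, dropout, save_file)
# come from the snippet; the flag spellings and default values are guesses.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--type", choices=["train", "evaluate"], default="train")
parser.add_argument("--layers", type=int, default=6)        # number of encoder/decoder layers
parser.add_argument("--d-model", type=int, default=256)     # model (embedding) dimension
parser.add_argument("--d-ff", type=int, default=1024)       # feed-forward dimension
parser.add_argument("--h-num", type=int, default=8)         # number of attention heads
parser.add_argument("--dropout", type=float, default=0.1)
parser.add_argument("--save-file", default="model.pt")      # checkpoint path used by evaluate
args = parser.parse_args()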
def evaluate_model(cws, model):
    pre = PrepareData(4000, 'pku', dict_path='corpus/pku_dict.utf8', type=CorpusType.Test)
    sentences = pre.raw_lines
    labels = pre.labels_index
    corr_count = 0
    re_count = 0
    total_count = 0
    for _, (sentence, label) in enumerate(zip(sentences, labels)):
        _, tag = cws.seg(sentence, model)
        cor_count, prec_count, recall_count = estimate_cws(tag, np.array(label))
        corr_count += cor_count
        re_count += recall_count
        total_count += prec_count
    prec = corr_count / total_count
    recall = corr_count / re_count
    print(prec)
    print(recall)
    print(2 * prec * recall / (prec + recall))  # F1 score
import numpy as np
import pandas as pd
from pprint import pprint
from prepare_data import PrepareData
from sklearn.ensemble import RandomForestClassifier

prepared_data = PrepareData()
labels = prepared_data.tag
raw_data = prepared_data.raw_data
training_data = raw_data[1:]
index_data = raw_data[0]

print(np.array(labels).shape)
print(np.array(training_data).shape)

# forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1, oob_score=True)
# forest.fit(training_data, labels)
# importances = forest.feature_importances_
# indices = np.argsort(importances)[::-1]
# print(forest.oob_score_)
# for f in range(len(index_data)):
#     print("%2d) %-*s %f" % (indices[f], 30, index_data[f], importances[indices[f]]))

import matplotlib.pyplot as plt
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest, chi2

###############################################################################
#!/usr/bin/env python
# coding: utf-8

from prepare_data import PrepareData
from model import LSTMClassifier
import pandas as pd
from sklearn.model_selection import train_test_split

ppd = PrepareData()
data = ppd.get_data()

lstm = LSTMClassifier()
X = lstm.get_matrix(data)
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

model = lstm.get_model(X.shape[1])
history = lstm.fit_model(model, X_train, Y_train)

validation_size = 1500
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

batch_size = 32  # not defined in the original snippet; assumed value
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
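# The snippet above sets aside X_validate / Y_validate but never uses them; a minimal,
# assumed follow-up would be to score the model on that held-out slice as well.
val_score, val_acc = model.evaluate(X_validate, Y_validate, verbose=2, batch_size=batch_size)
print("validation loss: %.4f, accuracy: %.4f" % (val_score, val_acc))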
class Indexator: """ Contains methods for loading the JSON documents in the corpus database. """ SETTINGS_DIR = '../conf' rxBadFileName = re.compile('[^\\w_.-]*', flags=re.DOTALL) def __init__(self, overwrite=False): self.overwrite = overwrite # whether to overwrite an existing index without asking with open(os.path.join(self.SETTINGS_DIR, 'corpus.json'), 'r', encoding='utf-8') as fSettings: self.settings = json.load(fSettings) self.j2h = JSON2HTML(settings=self.settings) self.name = self.settings['corpus_name'] self.languages = self.settings['languages'] if len(self.languages) <= 0: self.languages = [self.name] self.input_format = self.settings['input_format'] self.corpus_dir = os.path.join('../corpus', self.name) self.iterSent = None if self.input_format in ['json', 'json-gzip']: self.iterSent = JSONDocReader(format=self.input_format, settings=self.settings) # Make sure only commonly used word fields and those listed # in corpus.json get into the words index. self.goodWordFields = [ 'lex', # lemma 'wf', # word form (for search) 'wf_display', # word form (for display; optional) 'parts', # morpheme breaks in the word form 'gloss', # glosses (for display) 'gloss_index', # glosses (for search) 'n_ana' # number of analyses ] self.additionalWordFields = set() if 'word_fields' in self.settings: self.additionalWordFields |= set(self.settings['word_fields']) if 'word_table_fields' in self.settings: self.additionalWordFields |= set( self.settings['word_table_fields']) if 'accidental_word_fields' in self.settings: self.additionalWordFields -= set( self.settings['accidental_word_fields']) f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'), 'r', encoding='utf-8') categories = json.loads(f.read()) f.close() self.goodWordFields += [ 'gr.' + v for lang in categories for v in categories[lang].values() ] self.goodWordFields = set(self.goodWordFields) self.characterRegexes = {} self.pd = PrepareData() self.es = Elasticsearch() self.es_ic = IndicesClient(self.es) self.shuffled_ids = [i for i in range(1, 1000000)] random.shuffle(self.shuffled_ids) self.shuffled_ids.insert(0, 0) # id=0 is special and should not change self.tmpWordIDs = [{} for i in range(len(self.languages)) ] # word as JSON -> its integer ID self.tmpLemmaIDs = [{} for i in range(len(self.languages)) ] # lemma as string -> its integer ID # Apart from the two dictionaries above, words and lemmata # have string IDs starting with 'w' or 'l' followed by an integer self.word2lemma = [ {} for i in range(len(self.languages)) ] # word/lemma ID -> ID of its lemma (or -1, if none) self.wordFreqs = [{} for i in range(len(self.languages)) ] # word/lemma ID -> its frequency self.wordSFreqs = [{} for i in range(len(self.languages)) ] # word/lemma ID -> its number of sentences self.wordDocFreqs = [ {} for i in range(len(self.languages)) ] # (word/lemma ID, dID) -> word frequency in the document # self.wordSIDs = [{} for i in range(len(self.languages))] # word's ID -> set of sentence IDs self.wordDIDs = [{} for i in range(len(self.languages)) ] # word/lemma ID -> set of document IDs self.wfs = set() # set of word forms (for sorting) self.lemmata = set() # set of lemmata (for sorting) self.sID = 0 # current sentence ID for each language self.dID = 0 # current document ID self.wID = 0 # current word ID self.wordFreqID = 0 # current word_freq ID for word/document frequencies self.lemmaFreqID = 0 # current word_freq ID for lemma/document frequencies self.numWords = 0 # number of words in current document self.numSents = 0 # number of sentences in 
current document self.numWordsLang = [0] * len( self.languages ) # number of words in each language in current document self.numSentsLang = [0] * len( self.languages ) # number of sentences in each language in current document self.totalNumWords = 0 def delete_indices(self): """ If there already exist indices with the same names, ask the user if they want to overwrite them. If they say yes, remove the indices and return True. Otherwise, return False. """ if not self.overwrite: if (self.es_ic.exists(index=self.name + '.docs') or self.es_ic.exists(index=self.name + '.words') or self.es_ic.exists(index=self.name + '.sentences')): print('It seems that a corpus named "' + self.name + '" already exists. ' + 'Do you want to overwrite it? [y/n]') reply = input() if reply.lower() != 'y': print('Indexation aborted.') return False if self.es_ic.exists(index=self.name + '.docs'): self.es_ic.delete(index=self.name + '.docs') if self.es_ic.exists(index=self.name + '.words'): self.es_ic.delete(index=self.name + '.words') if self.es_ic.exists(index=self.name + '.sentences'): self.es_ic.delete(index=self.name + '.sentences') # Obsolete index word_freq can be present in pre-2019 corpora if self.es_ic.exists(index=self.name + '.word_freqs'): self.es_ic.delete(index=self.name + '.word_freqs') return True def create_indices(self): """ Create empty elasticsearch indices for corpus data, using mappings provided by PrepareData. """ self.sentWordMapping = self.pd.generate_words_mapping(wordFreqs=False) self.wordMapping = self.pd.generate_words_mapping(wordFreqs=True) self.sentMapping = self.pd.generate_sentences_mapping( self.sentWordMapping) self.docMapping = self.pd.generate_docs_mapping() self.es_ic.create(index=self.name + '.docs', body=self.docMapping) self.es_ic.create(index=self.name + '.words', body=self.wordMapping) self.es_ic.create(index=self.name + '.sentences', body=self.sentMapping) def randomize_id(self, realID): """ Return a (relatively) randomized sentence ID. This randomization is needed in context-aware word queries where the sentences are iterated in the order determined by their IDs. """ if realID < 0: return realID idStart, idEnd = realID // 1000000, realID % 1000000 return idStart * 1000000 + self.shuffled_ids[idEnd] def enhance_word(self, word): """ Add some calculated fields to the JSON word. """ if 'ana' not in word: word['n_ana'] = 0 else: word['n_ana'] = len(word['ana']) # n_ana is a (signed) byte, so a word can have at most 127 analyses if word['n_ana'] >= 127: word['n_ana'] = 127 def clean_word(self, w, langID): """ Clean a word object by removing unnecessary fields, lowercasing things if needed, etc. Return the cleaned object and the lemma. Add word form and lemma to the global lists. """ wClean = {'lang': langID} lemma = '' for field in w: if field in self.goodWordFields or field in self.additionalWordFields: wClean[field] = w[field] if field == 'wf': if 'wf_lowercase' not in self.settings or self.settings[ 'wf_lowercase']: wClean[field] = wClean[field].lower() self.wfs.add(wClean[field]) if 'ana' in w: lemma = self.get_lemma(w) self.lemmata.add(lemma) wClean['ana'] = [] for ana in w['ana']: cleanAna = {} for anaField in ana: if anaField in self.goodWordFields or anaField in self.additionalWordFields: cleanAna[anaField] = ana[anaField] wClean['ana'].append(cleanAna) return wClean, lemma def process_sentence_words(self, words, langID): """ Take words from a sentence, remove all non-searchable fields from them and add them to self.words dictionary. 
Add w_id and l_id properties to each word of the words list. Return the value of the 'sent_analyzed' meta field. """ sIDAdded = set( ) # word IDs for which the current settence ID has been counted bFullyAnalyzed = True # Whether each word in the sentence is analyzed bUniquelyAnalyzed = True # Whether, in addition, each word has exactly one analysis for w in words: if w['wtype'] != 'word': continue self.numWords += 1 self.numWordsLang[langID] += 1 self.totalNumWords += 1 self.enhance_word(w) if 'ana' not in w or len(w['ana']) <= 0: bFullyAnalyzed = False bUniquelyAnalyzed = False elif len(w['ana']) > 1: bUniquelyAnalyzed = False wClean, lemma = self.clean_word(w, langID) wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True) if wCleanTxt in self.tmpWordIDs[langID]: wID = self.tmpWordIDs[langID][wCleanTxt] else: wID = sum( len(self.tmpWordIDs[i]) for i in range(len(self.languages))) self.tmpWordIDs[langID][wCleanTxt] = wID wID = 'w' + str(wID) w['w_id'] = wID lID = 'l0' # Default: no analysis if len(lemma) > 0: try: lID = self.tmpLemmaIDs[langID][lemma] except KeyError: lID = sum( len(self.tmpLemmaIDs[i]) for i in range(len(self.languages))) + 1 self.tmpLemmaIDs[langID][lemma] = lID lID = 'l' + str(lID) self.word2lemma[langID][wID] = lID w['l_id'] = lID for itemID in [wID, lID]: try: self.wordFreqs[langID][itemID] += 1 except KeyError: self.wordFreqs[langID][itemID] = 1 if itemID not in sIDAdded: sIDAdded.add(itemID) try: self.wordSFreqs[langID][itemID] += 1 except KeyError: self.wordSFreqs[langID][itemID] = 1 try: self.wordDIDs[langID][itemID].add(self.dID) except KeyError: self.wordDIDs[langID][itemID] = {self.dID} try: self.wordDocFreqs[langID][(itemID, self.dID)] += 1 except KeyError: self.wordDocFreqs[langID][(itemID, self.dID)] = 1 if not bFullyAnalyzed: return 'incomplete' if not bUniquelyAnalyzed: return 'complete' return 'unique' def character_regex(self, lang): """ Regex for splitting text into characters. Takes into account multicharacter sequences (digraphs etc.) defined in lang_props.lexicographic_order. """ if lang in self.characterRegexes: return self.characterRegexes[lang] # cache if lang not in self.settings[ 'lang_props'] or 'lexicographic_order' not in self.settings[ 'lang_props'][lang]: self.characterRegexes[lang] = re.compile('.') return self.characterRegexes[lang] rxChars = '(' + '|'.join( re.escape(c.lower()) for c in sorted( self.settings['lang_props'][lang]['lexicographic_order'], key=lambda x: (-len(x), x)) if len(c) > 1) if len(rxChars) > 1: rxChars += '|' rxChars += '.)' rxChars = re.compile(rxChars) self.characterRegexes[lang] = rxChars return rxChars def make_sorting_function(self, lang): """ Return a function that can be used for sorting tokens in a list according to the alphabetical ordering specified for the language lang. """ sortingFunction = lambda x: x if lang in self.settings[ 'lang_props'] and 'lexicographic_order' in self.settings[ 'lang_props'][lang]: dictSort = { self.settings['lang_props'][lang]['lexicographic_order'][i]: (i, self.settings['lang_props'][lang]['lexicographic_order'][i]) for i in range( len(self.settings['lang_props'][lang] ['lexicographic_order'])) } maxIndex = len(dictSort) rxChars = self.character_regex(lang) def charReplaceFunction(c): if c in dictSort: return dictSort[c] return (maxIndex, c) sortingFunction = lambda x: [ charReplaceFunction(c) for c in rxChars.findall(x.lower()) ] return sortingFunction def sort_words(self, lang): """ Sort word forms and lemmata stored at earlier stages. 
Return dictionaries with positions of word forms and lemmata in the sorted list. If there is a custom alphabetical order for the language, use it. Otherwise, use standard lexicographic sorting. """ wfsSorted = {} iOrder = 0 sortingFunction = self.make_sorting_function(lang) for wf in sorted(self.wfs, key=sortingFunction): wfsSorted[wf] = iOrder iOrder += 1 lemmataSorted = {} iOrder = 0 for l in sorted(self.lemmata, key=sortingFunction): lemmataSorted[l] = iOrder iOrder += 1 return wfsSorted, lemmataSorted def get_freq_ranks(self, freqsSorted): """ Calculate frequency ranks and rank/quantile labels for words or lemmata. """ freqToRank = {} quantiles = {} prevFreq = 0 prevRank = 0 for i in range(len(freqsSorted)): v = freqsSorted[i] if v != prevFreq: if prevFreq != 0: freqToRank[prevFreq] = prevRank + (i - prevRank) // 2 prevRank = i prevFreq = v if prevFreq != 0: freqToRank[prevFreq] = prevRank + (len(freqsSorted) - prevRank) // 2 for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]: qIndex = math.ceil(q * len(freqsSorted)) if qIndex >= len(freqsSorted): qIndex = len(freqsSorted) - 1 if qIndex >= 0: quantiles[q] = freqsSorted[qIndex] else: quantiles[q] = 0 return freqToRank, quantiles def quantile_label(self, freq, rank, quantiles): """ Return a string label of the frequency rank (for frequent items) or quantile. This label is showed to the user in word query results. """ if freq > 1 and freq >= quantiles[0.5]: if freq > quantiles[0.03]: return '#' + str(rank + 1) else: return '> ' + str( min( math.ceil(q * 100) for q in quantiles if freq >= quantiles[q])) + '%' return '' def get_lemma(self, word, lower_lemma=True): """ Join all lemmata in the JSON representation of a word with an analysis and return them as a string. """ if 'ana' not in word: return '' if 'keep_lemma_order' not in self.settings or not self.settings[ 'keep_lemma_order']: curLemmata = set() for ana in word['ana']: if 'lex' in ana: if type(ana['lex']) == list: for l in ana['lex']: lAdd = l if lower_lemma: lAdd = lAdd.lower() curLemmata.add(lAdd) else: lAdd = ana['lex'] if lower_lemma: lAdd = lAdd.lower() curLemmata.add(lAdd) return '/'.join(l for l in sorted(curLemmata)) curLemmata = [] for ana in word['ana']: if 'lex' in ana: if type(ana['lex']) == list: for l in ana['lex']: lAdd = l if lower_lemma: lAdd = lAdd.lower() curLemmata.append(lAdd) else: lAdd = ana['lex'] if lower_lemma: lAdd = lAdd.lower() curLemmata.append(lAdd) return '/'.join(curLemmata) def get_grdic(self, word, lang): """ Join all dictionary grammar tags strings in the JSON representation of a word with an analysis and return them as a string. 
""" if 'ana' not in word: return '' curGramm = set() translations = set() for ana in word['ana']: grTags = '' if 'gr.pos' in ana: value = ana['gr.pos'] if type(value) == list: value = ', '.join(value) grTags = value for field in sorted(ana): value = ana[field] if type(value) == list: value = ', '.join(value) if ('lang_props' in self.settings and lang in self.settings['lang_props'] and 'dictionary_categories' in self.settings['lang_props'][lang] and field.startswith('gr.') and field[3:] in self.settings['lang_props'][lang] ['dictionary_categories']): if len(grTags) > 0: grTags += ', ' grTags += value elif field.startswith('trans_'): translations.add(value) if len(grTags) > 0: curGramm.add(grTags) return ' | '.join(grdic for grdic in sorted(curGramm)), ' | '.join( tr for tr in sorted(translations)) def iterate_lemmata(self, langID, lemmataSorted): """ Iterate over all lemmata for one language collected at the word iteration stage. """ lFreqsSorted = [ self.wordFreqs[langID][itemID] for itemID in self.wordFreqs[langID] if itemID.startswith('l') ] lFreqsSorted.sort(reverse=True) lemmaFreqToRank, quantiles = self.get_freq_ranks(lFreqsSorted) iLemma = 0 for l, lID in self.tmpLemmaIDs[langID].items(): lID = 'l' + str(lID) if iLemma % 250 == 0: print('indexing lemma', iLemma) lOrder = lemmataSorted[l] lemmaJson = { 'wf': l, 'wtype': 'lemma', 'lang': langID, 'l_order': lOrder, 'freq': self.wordFreqs[langID][lID], 'lemma_freq': self.wordFreqs[langID][lID], 'rank_true': lemmaFreqToRank[self.wordFreqs[langID][lID]], 'rank': self.quantile_label( self.wordFreqs[langID][lID], lemmaFreqToRank[self.wordFreqs[langID][lID]], quantiles), 'n_sents': self.wordSFreqs[langID][lID], 'n_docs': len(self.wordDIDs[langID][lID]), 'freq_join': 'word' } curAction = { '_index': self.name + '.words', '_id': lID, '_source': lemmaJson } iLemma += 1 yield curAction for docID in self.wordDIDs[langID][lID]: lfreqJson = { 'wtype': 'word_freq', 'l_id': lID, 'd_id': docID, 'l_order': lOrder, 'freq': self.wordDocFreqs[langID][(lID, docID)], 'freq_join': { 'name': 'word_freq', 'parent': lID } } curAction = { '_index': self.name + '.words', '_id': 'lfreq' + str(self.lemmaFreqID), '_source': lfreqJson, '_routing': lID } self.lemmaFreqID += 1 yield curAction def iterate_words(self): """ Iterate through all words collected at the previous stage. Return JSON objects with actions for bulk indexing in Elasticsearch. 
""" self.wID = 0 for langID in range(len(self.languages)): wfsSorted, lemmataSorted = self.sort_words(self.languages[langID]) iWord = 0 print('Processing words in ' + self.languages[langID] + '...') wFreqsSorted = [ self.wordFreqs[langID][itemID] for itemID in self.wordFreqs[langID] if itemID.startswith('w') ] wFreqsSorted.sort(reverse=True) wordFreqToRank, quantiles = self.get_freq_ranks(wFreqsSorted) lFreqsSorted = [ self.wordFreqs[langID][itemID] for itemID in self.wordFreqs[langID] if itemID.startswith('l') ] lFreqsSorted.sort(reverse=True) lemmaFreqToRank, lemmaQuantiles = self.get_freq_ranks(lFreqsSorted) # for wID in self.wordFreqs[langID]: for w, wID in self.tmpWordIDs[langID].items(): wID = 'w' + str(wID) if iWord % 500 == 0: print('indexing word', iWord) try: lID = self.word2lemma[langID][wID] except KeyError: lID = 'l0' wJson = json.loads(w) wfOrder = len(wfsSorted) + 1 if 'wf' in wJson: wfOrder = wfsSorted[wJson['wf']] lOrder = len(lemmataSorted) + 1 if 'ana' in wJson: lOrder = lemmataSorted[self.get_lemma(wJson)] wJson['wf_order'] = wfOrder wJson['l_order'] = lOrder wJson['l_id'] = lID wordFreq = self.wordFreqs[langID][wID] lemmaFreq = self.wordFreqs[langID][lID] wJson['freq'] = wordFreq wJson['lemma_freq'] = lemmaFreq # wJson['sids'] = [sid for sid in sorted(self.wordSIDs[langID][wID])] wJson['dids'] = [ did for did in sorted(self.wordDIDs[langID][wID]) ] wJson['n_sents'] = self.wordSFreqs[langID][wID] wJson['n_docs'] = len(wJson['dids']) wJson['rank_true'] = wordFreqToRank[ wJson['freq']] # for the calculations wJson['lemma_rank_true'] = lemmaFreqToRank[ self.wordFreqs[langID][lID]] # for the calculations wJson['rank'] = self.quantile_label(wJson['freq'], wJson['rank_true'], quantiles) # for the user wJson['freq_join'] = 'word' wJson['wtype'] = 'word' curAction = { '_index': self.name + '.words', '_id': wID, '_source': wJson } yield curAction for docID in wJson['dids']: wfreqJson = { 'wtype': 'word_freq', 'w_id': wID, 'l_id': lID, 'd_id': docID, 'wf_order': wfOrder, 'l_order': lOrder, 'freq': self.wordDocFreqs[langID][(wID, docID)], 'freq_join': { 'name': 'word_freq', 'parent': wID } } curAction = { '_index': self.name + '.words', '_id': 'wfreq' + str(self.wordFreqID), '_source': wfreqJson, '_routing': wID } self.wordFreqID += 1 yield curAction iWord += 1 self.wID += 1 for lAction in self.iterate_lemmata(langID, lemmataSorted): yield lAction emptyLemmaJson = { 'wf': '', 'wtype': 'lemma', 'freq': 0, 'rank_true': -1 } curAction = { '_index': self.name + '.words', '_id': 'l0', # l prefix stands for "lemma" '_source': emptyLemmaJson } yield curAction self.wfs = None self.lemmata = None def generate_dictionary(self): """ For each language, print out an HTML dictionary containing all lexemes of the corpus. 
""" for langID in range(len(self.languages)): iWord = 0 print('Generating dictionary for ' + self.languages[langID] + '...') lexFreqs = {} # lemma ID -> its frequency wFreqsSorted = [ v for v in sorted(self.wordFreqs[langID].values(), reverse=True) ] freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted) # for wID in self.wordFreqs[langID]: for w, wID in self.tmpWordIDs[langID].items(): wID = 'w' + str(wID) if iWord % 1000 == 0: print('processing word', iWord, 'for the dictionary') iWord += 1 wJson = json.loads(w) if 'ana' not in wJson or len(wJson['ana']) <= 0: continue lemma = self.get_lemma(wJson, lower_lemma=False) grdic, translations = self.get_grdic(wJson, self.languages[langID]) wordFreq = self.wordFreqs[langID][wID] lexTuple = (lemma, grdic, translations) if lexTuple not in lexFreqs: lexFreqs[lexTuple] = wordFreq else: lexFreqs[lexTuple] += wordFreq if len(lexFreqs) <= 0: continue if not os.path.exists('../search/web_app/templates/dictionaries'): os.makedirs('../search/web_app/templates/dictionaries') fOut = open(os.path.join( '../search/web_app/templates/dictionaries', 'dictionary_' + self.settings['corpus_name'] + '_' + self.languages[langID] + '.html'), 'w', encoding='utf-8') fOut.write( '<h1 class="dictionary_header"> {{ _(\'Dictionary_header\') }} ' '({{ _(\'langname_' + self.languages[langID] + '\') }})</h1>\n') prevLetter = '' sortingFunction = self.make_sorting_function( self.settings['languages'][langID]) for lemma, grdic, trans in sorted( lexFreqs, key=lambda x: (sortingFunction(x[0].lower()), -lexFreqs[x])): if len(lemma) <= 0: continue mChar = self.character_regex(self.languages[langID]).search( lemma.lower()) if mChar is None: curLetter = '*' else: curLetter = mChar.group(0) if curLetter != prevLetter: if prevLetter != '': fOut.write('</tbody>\n</table>\n') fOut.write('<h2 class="dictionary_letter">' + curLetter.upper() + '</h2>\n') fOut.write('<table class="dictionary_table">\n<thead>\n' '<th>{{ _(\'word_th_lemma\') }}</th>' '<th>{{ _(\'word_th_gr\') }}</th>' '<th>{{ _(\'word_th_trans_en\') }}</th>' '<th>{{ _(\'word_th_frequency\') }}</th>' '</thead>\n<tbody>\n') prevLetter = curLetter fOut.write('<tr>\n<td class="dictionary_lemma">' + lemma + '</td><td>' + grdic + '</td>' '<td>' + trans + '</td><td>' + str(lexFreqs[(lemma, grdic, trans)]) + '</td></tr>\n') if prevLetter != '': fOut.write('</tbody>\n</table>\n') fOut.close() def index_words(self): """ Index all words that have been collected at the previous stage in self.words (while the sentences were being indexed). """ bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60) if 'generate_dictionary' in self.settings and self.settings[ 'generate_dictionary']: self.generate_dictionary() def add_parallel_sids(self, sentences, paraIDs): """ In the parallel corpus, add the IDs of aligned sentences in other languages to each sentence that has a para_id. 
""" for s in sentences: if 'para_alignment' not in s['_source'] or 'lang' not in s[ '_source']: continue langID = s['_source']['lang'] for pa in s['_source']['para_alignment']: paraID = pa['para_id'] pa['sent_ids'] = [] for i in range(len(self.languages)): if i == langID: continue if paraID in paraIDs[i]: pa['sent_ids'] += paraIDs[i][paraID] def iterate_sentences(self, fname): self.numSents = 0 prevLast = False sentences = [] paraIDs = [{} for i in range(len(self.languages))] for s, bLast in self.iterSent.get_sentences(fname): if 'lang' in s: langID = s['lang'] else: langID = 0 s['lang'] = langID s['n_words'] = 0 if 'words' in s: sentAnaMeta = self.process_sentence_words(s['words'], langID) s['n_words'] = sum(1 for w in s['words'] if 'wtype' in w and w['wtype'] == 'word') if 'meta' not in s: s['meta'] = {} s['meta']['sent_analyses'] = sentAnaMeta if prevLast: prevLast = False elif self.numSents > 0: s['prev_id'] = self.randomize_id(self.sID - 1) if not bLast and 'last' not in s: s['next_id'] = self.randomize_id(self.sID + 1) else: prevLast = True s['doc_id'] = self.dID if 'meta' in s: for metaField in [ mf for mf in s['meta'].keys() if not (mf.startswith('year') or mf.endswith('_kw')) ]: s['meta'][metaField + '_kw'] = s['meta'][metaField] # self.es.index(index=self.name + '.sentences', # id=self.sID, # body=s) curAction = { '_index': self.name + '.sentences', '_id': self.randomize_id(self.sID), '_source': s } if len(self.languages) <= 1: yield curAction else: sentences.append(curAction) if 'para_alignment' in s: s['para_ids'] = [] for pa in s['para_alignment']: paraID = str(self.dID) + '_' + str(pa['para_id']) pa['para_id'] = paraID s['para_ids'].append(paraID) try: paraIDs[langID][paraID].append( self.randomize_id(self.sID)) except KeyError: paraIDs[langID][paraID] = [ self.randomize_id(self.sID) ] if self.sID % 500 == 0: print('Indexing sentence', self.sID, ',', self.totalNumWords, 'words so far.') self.numSents += 1 self.numSentsLang[langID] += 1 self.sID += 1 if len(self.languages) > 1: self.add_parallel_sids(sentences, paraIDs) for s in sentences: yield s @staticmethod def add_meta_keywords(meta): """ For each text field in the metadata, add a keyword version of the same field. """ for field in [k for k in meta.keys() if not k.startswith('year')]: meta[field + '_kw'] = meta[field] def index_doc(self, fname): """ Store the metadata of the source file. 
""" if self.dID % 100 == 0: print('Indexing document', self.dID) meta = self.iterSent.get_metadata(fname) self.add_meta_keywords(meta) meta['n_words'] = self.numWords meta['n_sents'] = self.numSents if len(self.settings['languages']) > 1: for i in range(len(self.languages)): meta['n_words_' + self.languages[i]] = self.numWordsLang[i] meta['n_sents_' + self.languages[i]] = self.numSentsLang[i] self.numWords = 0 self.numSents = 0 self.numWordsLang = [0] * len(self.languages) self.numSentsLang = [0] * len(self.languages) try: self.es.index(index=self.name + '.docs', id=self.dID, body=meta) except RequestError as err: print('Metadata error: {0}'.format(err)) shortMeta = {} if 'filename' in meta: shortMeta['filename'] = meta['filename'] if 'title' in meta: shortMeta['title'] = meta['title'] shortMeta['title_kw'] = meta['title'] self.es.index(index=self.name + '.docs', id=self.dID, body=shortMeta) if ('fulltext_view_enabled' in self.settings and self.settings['fulltext_view_enabled'] and 'fulltext_id' in meta): fnameOut = meta['fulltext_id'] + '.json' self.j2h.process_file( fname, os.path.join('../search/corpus_html', self.name, fnameOut)) self.dID += 1 def index_dir(self): """ Index all files from the corpus directory, sorted by their size in decreasing order. Such sorting helps prevent memory errors when indexing large corpora, as the default behavior is to load the whole file is into memory, and there is more free memory in the beginning of the process. If MemoryError occurs, the iterative JSON parser is used, which works much slower. """ filenames = [] for root, dirs, files in os.walk(self.corpus_dir): for fname in files: if (not ((self.settings['input_format'] == 'json' and fname.lower().endswith('.json')) or (self.settings['input_format'] == 'json-gzip' and fname.lower().endswith('.json.gz')))): continue fnameFull = os.path.join(root, fname) filenames.append((fnameFull, os.path.getsize(fnameFull))) if len(filenames) <= 0: print('There are no files in this corpus.') return for fname, fsize in sorted(filenames, key=lambda p: -p[1]): # print(fname, fsize) if 'sample_size' in self.settings and 0 < self.settings[ 'sample_size'] < 1: # Only take a random sample of the source files (for test purposes) if random.random() > self.settings['sample_size']: continue bulk(self.es, self.iterate_sentences(fname), chunk_size=200, request_timeout=60) self.index_doc(fname) self.index_words() def compile_translations(self): """ Compile flask_babel translations in ../search/web_app. """ pythonPath = '' for p in sys.path: if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None: pythonPath = p break if len(pythonPath) <= 0: pyBabelPath = 'pybabel' else: pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel') try: subprocess.run( [pyBabelPath, 'compile', '-d', 'translations_pybabel'], cwd='../search/web_app', check=True) except: print('Could not compile translations with ' + pyBabelPath + ' .') else: print('Interface translations compiled.') def load_corpus(self): """ Drop the current database, if any, and load the entire corpus. """ t1 = time.time() # self.compile_translations() indicesDeleted = self.delete_indices() if not indicesDeleted: return self.create_indices() self.index_dir() t2 = time.time() print('Corpus indexed in', t2 - t1, 'seconds:', self.dID, 'documents,', self.sID, 'sentences,', self.totalNumWords, 'words,', sum(len(self.wordFreqs[i]) for i in range(len(self.languages))), 'word types (different words).')
class Indexator: """ Contains methods for loading the JSON documents in the corpus database. """ SETTINGS_DIR = '../conf' def __init__(self): f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'), 'r', encoding='utf-8') self.settings = json.loads(f.read()) f.close() self.name = self.settings['corpus_name'] self.languages = self.settings['languages'] if len(self.languages) <= 0: self.languages = [self.name] self.input_format = self.settings['input_format'] self.corpus_dir = os.path.join('../corpus', self.name) self.iterSent = None if self.input_format in ['json', 'json-gzip']: self.iterSent = JSONDocReader(format=self.input_format) self.goodWordFields = [ 'lex', 'wf', 'wf_display', 'parts', 'gloss', 'gloss_index', 'n_ana', 'trans_en', 'trans_ru' ] self.AdditionalWordFields = set() if 'word_fields' in self.settings: self.AdditionalWordFields |= set(self.settings['word_fields']) if 'word_table_fields' in self.settings: self.AdditionalWordFields |= set( self.settings['word_table_fields']) f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'), 'r', encoding='utf-8') categories = json.loads(f.read()) self.goodWordFields += [ 'gr.' + v for lang in categories for v in categories[lang].values() ] self.goodWordFields = set(self.goodWordFields) f.close() self.pd = PrepareData() self.es = Elasticsearch() self.es_ic = IndicesClient(self.es) self.shuffled_ids = [i for i in range(1, 1000000)] random.shuffle(self.shuffled_ids) self.shuffled_ids.insert(0, 0) # id=0 is special and should not change self.tmpWordIDs = [{} for i in range(len(self.languages)) ] # word as JSON -> its integer ID self.tmpLemmaIDs = [{} for i in range(len(self.languages)) ] # lemma as string -> its integer ID self.word2lemma = [{} for i in range(len(self.languages)) ] # word's ID -> ID of its lemma (or -1, if none) self.wordFreqs = [{} for i in range(len(self.languages)) ] # word's ID -> its frequency self.wordSFreqs = [{} for i in range(len(self.languages)) ] # word's ID -> its number of sentences self.wordDocFreqs = [ {} for i in range(len(self.languages)) ] # (word's ID, dID) -> word frequency in the document # self.wordSIDs = [{} for i in range(len(self.languages))] # word's ID -> set of sentence IDs self.wordDIDs = [{} for i in range(len(self.languages)) ] # word's ID -> set of document IDs self.wfs = set() # set of word forms (for sorting) self.lemmata = set() # set of lemmata (for sorting) self.sID = 0 # current sentence ID for each language self.dID = 0 # current document ID self.wID = 0 # current word ID self.wordFreqID = 0 self.numWords = 0 # number of words in current document self.numSents = 0 # number of sentences in current document self.numWordsLang = [0] * len( self.languages ) # number of words in each language in current document self.numSentsLang = [0] * len( self.languages ) # number of sentences in each language in current document self.totalNumWords = 0 def delete_indices(self): if self.es_ic.exists(index=self.name + '.docs'): self.es_ic.delete(index=self.name + '.docs') if self.es_ic.exists(index=self.name + '.words'): self.es_ic.delete(index=self.name + '.words') if self.es_ic.exists(index=self.name + '.word_freqs'): self.es_ic.delete(index=self.name + '.word_freqs') if self.es_ic.exists(index=self.name + '.sentences'): self.es_ic.delete(index=self.name + '.sentences') def create_indices(self): self.wordMapping = self.pd.generate_words_mapping() self.wordFreqMapping = self.pd.generate_wordfreq_mapping() self.sentMapping = self.pd.generate_sentences_mapping(self.wordMapping) self.docMapping = 
self.pd.generate_docs_mapping() self.es_ic.create(index=self.name + '.docs', body=self.docMapping) self.es_ic.create(index=self.name + '.words', body=self.wordMapping) self.es_ic.create(index=self.name + '.sentences', body=self.sentMapping) def randomize_id(self, realID): """ Return a (relatively) randomized sentence ID. This randomization is needed in context-aware word queries where the sentences are iterated in the order determined by their IDs. """ if realID < 0: return realID idStart, idEnd = realID // 1000000, realID % 1000000 return idStart * 1000000 + self.shuffled_ids[idEnd] def enhance_word(self, word): """ Add some calculated fields to the JSON word. """ if 'ana' not in word: word['n_ana'] = 0 else: word['n_ana'] = len(word['ana']) if word['n_ana'] >= 127: word['n_ana'] = 127 def process_sentence_words(self, words, langID): """ Take words list from a sentence, remove all non-searchable fields from them and add them to self.words dictionary. Add w_id property to each word of the words list. """ sIDAdded = set( ) # word IDs for which the current settence ID has been counted for it for w in words: if w['wtype'] != 'word': continue self.numWords += 1 self.numWordsLang[langID] += 1 self.totalNumWords += 1 self.enhance_word(w) wClean = {'lang': langID} lemma = '' for field in w: if field in self.goodWordFields or field in self.AdditionalWordFields: wClean[field] = w[field] if field == 'wf': if 'wf_lowercase' not in self.settings or self.settings[ 'wf_lowercase']: wClean[field] = wClean[field].lower() self.wfs.add(wClean[field]) if 'ana' in w: lemma = self.get_lemma(w) self.lemmata.add(lemma) wClean['ana'] = [] for ana in w['ana']: cleanAna = {} for anaField in ana: if anaField in self.goodWordFields or anaField in self.AdditionalWordFields: cleanAna[anaField] = ana[anaField] wClean['ana'].append(cleanAna) wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True) if wCleanTxt in self.tmpWordIDs[langID]: wID = self.tmpWordIDs[langID][wCleanTxt] else: wID = sum( len(self.tmpWordIDs[i]) for i in range(len(self.languages))) self.tmpWordIDs[langID][wCleanTxt] = wID w['w_id'] = wID if len(lemma) > 0: try: lemmaID = self.tmpLemmaIDs[langID][lemma] except KeyError: lemmaID = sum( len(self.tmpLemmaIDs[i]) for i in range(len(self.languages))) + 1 self.tmpLemmaIDs[langID][lemma] = lemmaID self.word2lemma[langID][wID] = lemmaID try: self.wordFreqs[langID][wID] += 1 except KeyError: self.wordFreqs[langID][wID] = 1 if wID not in sIDAdded: sIDAdded.add(wID) try: self.wordSFreqs[langID][wID] += 1 except KeyError: self.wordSFreqs[langID][wID] = 1 try: self.wordDIDs[langID][wID].add(self.dID) except KeyError: self.wordDIDs[langID][wID] = {self.dID} try: self.wordDocFreqs[langID][(wID, self.dID)] += 1 except KeyError: self.wordDocFreqs[langID][(wID, self.dID)] = 1 def sort_words(self): """ Sort word forms and lemmata stored at earlier stages. Return dictionaries with positions of word forms and lemmata in the sorted list. Delete the original lists. """ wfsSorted = {} iOrder = 0 for wf in sorted(self.wfs): wfsSorted[wf] = iOrder iOrder += 1 self.wfs = None lemmataSorted = {} iOrder = 0 for l in sorted(self.lemmata): lemmataSorted[l] = iOrder iOrder += 1 self.lemmata = None return wfsSorted, lemmataSorted def get_freq_ranks(self, freqsSorted): """ Calculate frequency ranks and rank/quantile labels for words or lemmata. 
""" freqToRank = {} quantiles = {} prevFreq = 0 prevRank = 0 for i in range(len(freqsSorted)): v = freqsSorted[i] if v != prevFreq: if prevFreq != 0: freqToRank[prevFreq] = prevRank + (i - prevRank) // 2 prevRank = i prevFreq = v if prevFreq != 0: freqToRank[prevFreq] = prevRank + (len(freqsSorted) - prevRank) // 2 for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]: qIndex = math.ceil(q * len(freqsSorted)) if qIndex >= len(freqsSorted): qIndex = len(freqsSorted) - 1 if qIndex >= 0: quantiles[q] = freqsSorted[qIndex] else: quantiles[q] = 0 return freqToRank, quantiles def quantile_label(self, freq, rank, quantiles): """ Return a string label of the frequency rank (for frequent items) or quantile. This label is showed to the user in word query results. """ if freq > 1 and freq >= quantiles[0.5]: if freq > quantiles[0.03]: return '#' + str(rank + 1) else: return '> ' + str( min( math.ceil(q * 100) for q in quantiles if freq >= quantiles[q])) + '%' return '' def get_lemma(self, word): """ Join all lemmata in the JSON representation of a word with an analysis and return them as a string. """ if 'ana' not in word: return '' if 'keep_lemma_order' not in self.settings or not self.settings[ 'keep_lemma_order']: curLemmata = set() for ana in word['ana']: if 'lex' in ana: if type(ana['lex']) == list: for l in ana['lex']: curLemmata.add(l.lower()) else: curLemmata.add(ana['lex'].lower()) return '/'.join(l for l in sorted(curLemmata)) curLemmata = [] for ana in word['ana']: if 'lex' in ana: if type(ana['lex']) == list: for l in ana['lex']: curLemmata.append(l.lower()) else: curLemmata.append(ana['lex'].lower()) return '/'.join(curLemmata) def iterate_lemmata(self, langID, lemmaFreqs, lemmaDIDs): """ Iterate over all lemmata for one language collected at the word iteration stage. """ lFreqsSorted = [v for v in sorted(lemmaFreqs.values(), reverse=True)] freqToRank, quantiles = self.get_freq_ranks(lFreqsSorted) iLemma = 0 for l, lID in self.tmpLemmaIDs[langID].items(): #if iLemma % 250 == 0: # print('indexing lemma', iLemma) lemmaJson = { 'wf': l, 'freq': lemmaFreqs[lID], 'rank_true': freqToRank[lemmaFreqs[lID]], 'rank': self.quantile_label(lemmaFreqs[lID], freqToRank[lemmaFreqs[lID]], quantiles), 'n_docs': len(lemmaDIDs[lID]) } curAction = { '_index': self.name + '.words', '_type': 'lemma', '_id': lID, '_source': lemmaJson } iLemma += 1 yield curAction def iterate_words(self): """ Iterate through all words collected at the previous stage. Return JSON objects with actions for bulk indexing in Elasticsearch. 
""" self.wID = 0 wfsSorted, lemmataSorted = self.sort_words() for langID in range(len(self.languages)): iWord = 0 #print('Processing words in ' + self.languages[langID] + '...') lemmaFreqs = {} # lemma ID -> its frequency lemmaDIDs = {} # lemma ID -> its document IDs wFreqsSorted = [ v for v in sorted(self.wordFreqs[langID].values(), reverse=True) ] freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted) # for wID in self.wordFreqs[langID]: for w, wID in self.tmpWordIDs[langID].items(): #if iWord % 500 == 0: # print('indexing word', iWord) try: lID = self.word2lemma[langID][wID] except KeyError: lID = 0 wJson = json.loads(w) wfOrder = len(wfsSorted) + 1 if 'wf' in wJson: wfOrder = wfsSorted[wJson['wf']] lOrder = len(lemmataSorted) + 1 if 'ana' in wJson: lOrder = lemmataSorted[self.get_lemma(wJson)] wJson['wf_order'] = wfOrder wJson['l_order'] = lOrder wordFreq = self.wordFreqs[langID][wID] wJson['freq'] = wordFreq try: lemmaFreqs[lID] += wordFreq except KeyError: lemmaFreqs[lID] = wordFreq if lID != 0: try: lemmaDIDs[lID] |= self.wordDIDs[langID][wID] except KeyError: lemmaDIDs[lID] = set(self.wordDIDs[langID][wID]) # wJson['sids'] = [sid for sid in sorted(self.wordSIDs[langID][wID])] wJson['dids'] = [ did for did in sorted(self.wordDIDs[langID][wID]) ] wJson['n_sents'] = self.wordSFreqs[langID][wID] wJson['n_docs'] = len(wJson['dids']) wJson['rank_true'] = freqToRank[ wJson['freq']] # for the calculations wJson['rank'] = self.quantile_label(wJson['freq'], wJson['rank_true'], quantiles) # for the user curAction = { '_index': self.name + '.words', '_type': 'word', '_id': wID, '_source': wJson, '_parent': lID } yield curAction for docID in wJson['dids']: wfreqJson = { 'w_id': wID, 'd_id': docID, 'wf_order': wfOrder, 'l_order': lOrder, 'freq': self.wordDocFreqs[langID][(wID, docID)] } curAction = { '_index': self.name + '.words', '_type': 'word_freq', '_id': self.wordFreqID, '_source': wfreqJson, '_parent': wID, '_routing': lID } self.wordFreqID += 1 yield curAction iWord += 1 self.wID += 1 for lAction in self.iterate_lemmata(langID, lemmaFreqs, lemmaDIDs): yield lAction emptyLemmaJson = {'wf': 0, 'freq': 0, 'rank_true': -1} curAction = { '_index': self.name + '.words', '_type': 'lemma', '_id': 0, '_source': emptyLemmaJson } yield curAction def index_words(self): """ Index all words that have been collected at the previous stage in self.words (while the sentences were being indexed). """ bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60) def add_parallel_sids(self, sentences, paraIDs): """ In the parallel corpus, add the IDs of aligned sentences in other languages to each sentence that has a para_id. 
""" for s in sentences: if 'para_alignment' not in s['_source'] or 'lang' not in s[ '_source']: continue langID = s['_source']['lang'] for pa in s['_source']['para_alignment']: paraID = pa['para_id'] pa['sent_ids'] = [] for i in range(len(self.languages)): if i == langID: continue if paraID in paraIDs[i]: pa['sent_ids'] += paraIDs[i][paraID] def iterate_sentences(self, fname): self.numSents = 0 prevLast = False sentences = [] paraIDs = [{} for i in range(len(self.languages))] for s, bLast in self.iterSent.get_sentences(fname): if 'lang' in s: langID = s['lang'] else: langID = 0 s['lang'] = langID s['n_words'] = 0 if 'words' in s: self.process_sentence_words(s['words'], langID) s['n_words'] = sum(1 for w in s['words'] if 'wtype' in w and w['wtype'] == 'word') if prevLast: prevLast = False elif self.numSents > 0: s['prev_id'] = self.randomize_id(self.sID - 1) if not bLast and 'last' not in s: s['next_id'] = self.randomize_id(self.sID + 1) else: prevLast = True s['doc_id'] = self.dID if 'meta' in s: for metaField in [ mf for mf in s['meta'].keys() if not mf.startswith('year') ]: s['meta'][metaField + '_kw'] = s['meta'][metaField] # self.es.index(index=self.name + '.sentences', # doc_type='sentence', # id=self.sID, # body=s) curAction = { '_index': self.name + '.sentences', '_type': 'sentence', '_id': self.randomize_id(self.sID), '_source': s } if len(self.languages) <= 1: yield curAction else: sentences.append(curAction) if 'para_alignment' in s: s['para_ids'] = [] for pa in s['para_alignment']: paraID = str(self.dID) + '_' + str(pa['para_id']) pa['para_id'] = paraID s['para_ids'].append(paraID) try: paraIDs[langID][paraID].append( self.randomize_id(self.sID)) except KeyError: paraIDs[langID][paraID] = [ self.randomize_id(self.sID) ] #if self.sID % 500 == 0: # print('Indexing sentence', self.sID, ',', self.totalNumWords, 'words so far.') self.numSents += 1 self.numSentsLang[langID] += 1 self.sID += 1 if len(self.languages) > 1: self.add_parallel_sids(sentences, paraIDs) for s in sentences: yield s @staticmethod def add_meta_keywords(meta): """ For each text field in the metadata, add a keyword version of the same field. """ for field in [k for k in meta.keys() if not k.startswith('year')]: meta[field + '_kw'] = meta[field] def index_doc(self, fname): """ Store the metadata of the source file. """ #if self.dID % 100 == 0: #print('Indexing document', self.dID) meta = self.iterSent.get_metadata(fname) self.add_meta_keywords(meta) meta['n_words'] = self.numWords meta['n_sents'] = self.numSents if len(self.settings['languages']) > 1: for i in range(len(self.languages)): meta['n_words_' + self.languages[i]] = self.numWordsLang[i] meta['n_sents_' + self.languages[i]] = self.numSentsLang[i] self.numWords = 0 self.numSents = 0 self.numWordsLang = [0] * len(self.languages) self.numSentsLang = [0] * len(self.languages) try: self.es.index(index=self.name + '.docs', doc_type='doc', id=self.dID, body=meta) except RequestError as err: #print('Metadata error: {0}'.format(err)) shortMeta = {} if 'filename' in meta: shortMeta['filename'] = meta['filename'] if 'title' in meta: shortMeta['title'] = meta['title'] shortMeta['title_kw'] = meta['title'] self.es.index(index=self.name + '.docs', doc_type='doc', id=self.dID, body=shortMeta) self.dID += 1 def index_dir(self): """ Index all files from the corpus directory, sorted by their size in decreasing order. 
        Such sorting helps prevent memory errors when indexing large corpora,
        since the default behavior is to load the whole file into memory, and
        there is more free memory at the beginning of the process. If a
        MemoryError occurs, the iterative JSON parser is used instead, which
        is much slower.
        """
        filenames = []
        for root, dirs, files in os.walk(self.corpus_dir):
            for fname in files:
                if (not ((self.settings['input_format'] == 'json'
                          and fname.lower().endswith('.json'))
                         or (self.settings['input_format'] == 'json-gzip'
                             and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        for fname, fsize in sorted(filenames, key=lambda p: -p[1]):
            # print(fname, fsize)
            bulk(self.es, self.iterate_sentences(fname),
                 chunk_size=200, request_timeout=60)
            self.index_doc(fname)
        self.index_words()

    def compile_translations(self):
        """
        Compile flask_babel translations in ../search/web_app.
        """
        pythonPath = ''
        for p in sys.path:
            if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None:
                pythonPath = p
                break
        if len(pythonPath) <= 0:
            pyBabelPath = 'pybabel'
        else:
            pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel')
        try:
            subprocess.run([pyBabelPath, 'compile', '-d', 'translations'],
                           cwd='../search/web_app', check=True)
        except Exception:
            print('Could not compile translations with ' + pyBabelPath + '.')
        else:
            print('Interface translations compiled.')

    def load_corpus(self):
        """
        Drop the current database, if any, and load the entire corpus.
        """
        t1 = time.time()
        self.compile_translations()
        self.delete_indices()
        self.create_indices()
        self.index_dir()
        t2 = time.time()
        print('Corpus indexed in', t2 - t1, 'seconds:',
              self.dID, 'documents,',
              self.sID, 'sentences,',
              self.totalNumWords, 'words,',
              sum(len(self.wordFreqs[i]) for i in range(len(self.languages))),
              'word types (different words).')
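

# A minimal usage sketch for the indexer class above: instantiate it and call
# load_corpus() to drop and rebuild the Elasticsearch indices for the corpus.
# The class name Indexator is a hypothetical placeholder, since the class
# header is not visible in this fragment.
if __name__ == '__main__':
    indexator = Indexator()  # hypothetical class name
    indexator.load_corpus()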
def run_training(config_file, load_filename=""):
    # read the data
    config = Config.from_json_file(config_file)
    save_dir = config.save_dir
    datafile = config.datafile
    corpus_dir = config.corpus_dir
    prepare_data = PrepareData(min_count=config.MIN_COUNT, max_length=config.MAX_LENGTH)
    vocab, pairs = prepare_data.load_prepare_data(corpus_dir, datafile, save_dir)

    # set checkpoint to load from; leave empty if starting from scratch
    # load_filename = os.path.join(save_dir, model_name, corpus_name,
    #                              '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
    #                              '{}_checkpoint.tar'.format(checkpoint_iter))
    if load_filename:
        # if loading on the same machine the model was trained on
        checkpoint = torch.load(load_filename)
        # if loading a model trained on GPU onto a CPU:
        # checkpoint = torch.load(load_filename, map_location=torch.device('cpu'))
        encoder_sd = checkpoint["en"]
        decoder_sd = checkpoint["de"]
        encoder_optimizer_sd = checkpoint["en_opt"]
        decoder_optimizer_sd = checkpoint["de_opt"]
        embedding_sd = checkpoint["embedding"]
        vocab.__dict__ = checkpoint["voc_dict"]

    print("Building encoder and decoder ...")
    # initialize word embeddings
    embedding = nn.Embedding(vocab.num_words, config.hidden_size).to(config.device)
    if load_filename:
        embedding.load_state_dict(embedding_sd)
    # initialize encoder and decoder models
    encoder = EncoderRNN(config.hidden_size, embedding,
                         config.encoder_n_layers, config.dropout).to(config.device)
    decoder = LuongAttnDecoderRNN(config.attn_model, embedding, config.hidden_size,
                                  vocab.num_words, config.decoder_n_layers,
                                  config.dropout).to(config.device)
    if load_filename:
        encoder.load_state_dict(encoder_sd)
        decoder.load_state_dict(decoder_sd)
    print("Models built and ready to go.")

    # ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    # initialize optimizers
    print("Building optimizers ...")
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=config.learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=config.learning_rate * config.decoder_learning_ratio)
    if load_filename:
        encoder_optimizer.load_state_dict(encoder_optimizer_sd)
        decoder_optimizer.load_state_dict(decoder_optimizer_sd)

    # run training iterations
    training_iters(config, vocab, pairs, encoder, decoder, encoder_optimizer,
                   decoder_optimizer, embedding, save_dir, load_filename)
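

# A minimal sketch of how run_training() might be invoked; the config file
# path and the checkpoint path below are illustrative assumptions, not files
# known to exist in this project.
if __name__ == "__main__":
    # train from scratch using only a JSON config file (hypothetical path)
    run_training("config/train_config.json")
    # or resume from a previously saved checkpoint (hypothetical path):
    # run_training("config/train_config.json",
    #              load_filename="save/model/4000_checkpoint.tar")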
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prepare_data import PrepareData
from pprint import pprint

if __name__ == "__main__":
    p = PrepareData()
    result = p.process_all_gene()
    result = sorted(result, key=lambda x: x[1], reverse=True)
    index = [x[0] for x in result]
    value = [x[1] for x in result]
    # print([index, value])
    df = pd.DataFrame(data=result, columns=["index", "value"])
    df.to_csv("./result/3.csv", index=False)

    # plot
    plt.figure()
    plt.bar(index, value, width=.2, color='g')
    plt.title("Gene Importance Bar Image")
    plt.xlabel("Gene number")
    plt.ylabel("Gene importance")
    plt.savefig('./result/3.png')
    plt.close()
def main():
    # preprocess the data
    # dataList = []
    # for i in range(3, 11):
    #     dataList.append(PrepareData())
    #     args.train_file = '/Users/wangyihao/Pycharm/transformer-simple-master_new/data/data' + str(i) + '.p'
    data = PrepareData()
    args.src_vocab = len(data.en_word_dict)
    args.tgt_vocab = len(data.cn_word_dict)
    print("src_vocab %d" % args.src_vocab)
    print("tgt_vocab %d" % args.tgt_vocab)

    # initialize the model
    model = make_model(args.src_vocab, args.tgt_vocab, args.layers,
                       args.d_model, args.d_ff, args.h_num, args.dropout)

    if args.type == 'train':
        # training
        print(">>>>>>> start train")
        criterion = LabelSmoothing(args.tgt_vocab, padding_idx=0, smoothing=0.0)
        optimizer = NoamOpt(args.d_model, 1, 2000,
                            torch.optim.Adam(model.parameters(), lr=0,
                                             betas=(0.9, 0.98), eps=1e-9))
        train(data, model, criterion, optimizer)
        print("<<<<<<< finished train")
    elif args.type == "evaluate":
        # evaluation: first check whether the model has already been trained
        if os.path.exists(args.save_file):
            # load the model
            model.load_state_dict(torch.load(args.save_file))
            # start evaluating
            print(">>>>>>> start evaluate")
            precision = evaluate(data, model)
            TP_total = precision.sum(axis=0)[0]
            FP_total = precision.sum(axis=0)[1]
            TN_total = precision.sum(axis=0)[2]
            FN_total = precision.sum(axis=0)[3]
            TPR = TP_total / (TP_total + FN_total)  # true positive rate
            TNR = TN_total / (TN_total + FP_total)  # true negative rate
            print('total true positive amount: %.3f, total false negative amount: %.3f'
                  % (TP_total, FN_total))
            print('total true negative amount: %.3f, total false positive amount: %.3f'
                  % (TN_total, FP_total))
            print('symbol within feature TPR: %.3f, delimiter TNR: %.3f' % (TPR, TNR))
            print("<<<<<<< finished evaluate")
        else:
            print("Error: please train before evaluating")
    elif args.type == "predict":
        # read input features and predict
        if os.path.exists(args.save_file):
            # load the model
            model.load_state_dict(torch.load(args.save_file))
            # start predicting
            print(">>>>>>> start predict")
            translation = predict(data, model)
            print("<<<<<<< finished predict")
    else:
        print("Error: please select type within [train / evaluate / predict]")
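

# main() above reads its hyperparameters from a module-level `args` object
# that is not shown in this fragment. The sketch below is one plausible
# argparse setup covering the attributes main() uses; the flag names and
# default values are assumptions for illustration only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--type', default='train', choices=['train', 'evaluate', 'predict'])
parser.add_argument('--layers', type=int, default=6)
parser.add_argument('--d_model', type=int, default=256)
parser.add_argument('--d_ff', type=int, default=1024)
parser.add_argument('--h_num', type=int, default=8)
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--save_file', default='save/model.pt')
args = parser.parse_args()

if __name__ == '__main__':
    main()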