def load_data_multiable_new(word2index, word2index_label, valid_portion=0.05, max_training_data=1e6,
                            training_data_path='train-zhihu4-only-title-all.txt',
                            multi_label_flag=True, use_seq2seq=False, seq2seq_label_length=6):
    """Load zhihu training data from a '__label__'-delimited text file.

    Each line has the form "<title text> __label__<label(s)>".

    Args:
        word2index: word -> index vocabulary for the input text side.
        word2index_label: label -> index vocabulary (also holds _GO/_END/_PAD
            when use_seq2seq is True).
        valid_portion: fraction of examples reserved for validation.
        max_training_data: unused here; kept for interface compatibility.
        training_data_path: path to the training file (UTF-8).
        multi_label_flag: if True, each y is a space-separated label list
            turned into a multi-hot vector; if False, y is a single label index.
        use_seq2seq: if True, build fixed-length label sequences padded with
            _PAD, terminated with _END, plus a _GO-prefixed decoder input.
        seq2seq_label_length: fixed output sequence length for seq2seq mode.

    Returns:
        train: tuple (X, Y) truncated to the first (1 - valid_portion) of
        the examples. NOTE(review): the original computed `train` but never
        returned it (likely truncated upstream); returning it is
        backward-compatible for callers that ignored the None result.
    """
    print('load_data.started...')
    print('load_data_multi_newtraining_data_path:', training_data_path)
    # BUG FIX: was `codes.open(path, 'r', 'urf8')` — misspelled `codecs` module
    # and misspelled encoding; use the builtin open and close it deterministically.
    with open(training_data_path, 'r', encoding='utf8') as zhihu_f:
        lines = zhihu_f.readlines()
    X = []
    Y = []
    Y_decoder_input = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')
        y = y.strip().replace('\n', '')
        x = x.strip()
        if i < 1:
            print(i, 'x0:', x)
        if use_seq2seq:
            ys = y.replace('\n', '').split(" ")
            _PAD_INDEX = word2index_label[_PAD]
            ys_multihot_list = [_PAD_INDEX] * seq2seq_label_length
            ys_decoder_input = [_PAD_INDEX] * seq2seq_label_length
            # BUG FIX: the original had TWO copies of this loop; the first
            # assigned the whole dict (`ys_multihot_list[j] = word2index_label`)
            # and was entirely overwritten by the correct second copy. The dead
            # buggy copy is removed; final state is unchanged.
            for j, y_label in enumerate(ys):
                if j < seq2seq_label_length - 1:
                    ys_multihot_list[j] = word2index_label[y_label]
            # Terminate the label sequence with _END (truncating if too long).
            if len(ys) > seq2seq_label_length - 1:
                ys_multihot_list[seq2seq_label_length - 1] = word2index_label[_END]
            else:
                ys_multihot_list[len(ys)] = word2index_label[_END]
            # Decoder input starts with _GO; remaining slots stay _PAD as in
            # the original (labels are not copied into the decoder input here).
            ys_decoder_input[0] = word2index_label[_GO]
            if i < 10:
                print(i, 'ys:======>0', ys)
                print(i, 'ys_multihot_list:=======>1', ys_multihot_list)
                print(i, 'ys_decoder_input:=======>2', ys_decoder_input)
        else:
            if multi_label_flag:
                ys = y.replace('\n', '').split(' ')
                ys_index = []
                for y_label in ys:
                    # BUG FIX: original looked labels up in `word2index` (the
                    # word vocabulary); labels live in `word2index_label`.
                    y_index = word2index_label[y_label]
                    ys_index.append(y_index)
                ys_multihot_list = transform_multiable_as_multihot(ys_index)
            else:
                ys_multihot_list = word2index_label[y]
            if i <= 3:
                print('ys_index:')
                print(i, 'y:', y, ' ;ys_multihot_list:', ys_multihot_list)
        X.append(x)
        Y.append(ys_multihot_list)
        if use_seq2seq:
            Y_decoder_input.append(ys_decoder_input)
    number_examples = len(X)
    print('number_examples:', number_examples)
    train = (X[:int((1 - valid_portion) * number_examples)],
             Y[:int((1 - valid_portion) * number_examples)])
    return train
def create_vocabulary_label(vocabulary_label='./data/train-zhihu4-only-title-all.txt',
                            name_scope='', use_seq2seq=False):
    """Build label->index and index->label vocabularies from a training file.

    Labels are extracted from every line containing '__label__', counted, and
    sorted by frequency via sort_by_value(). The result is cached as a pickle
    under cache_vocabulary_label_pik/ keyed by name_scope.

    Args:
        vocabulary_label: path to the '__label__'-delimited training file.
        name_scope: prefix for the pickle cache filename.
        use_seq2seq: if True, reserve indices 0-2 for _GO/_END/_PAD and shift
            real labels up by 3.

    Returns:
        (word2index_label, index2word_label) dict pair.
    """
    # BUG FIX: original printed `vocablary_label` (undefined name -> NameError).
    print('create_vocabulary_labrl_storted.started.training_data_path:', vocabulary_label)
    cache_path = 'cache_vocabulary_label_pik/' + name_scope + '_label_vocabulary.pik'
    # Fast path: reuse a previously pickled vocabulary.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as data_f:
            word2index_label, index2word_label = pickle.load(data_f)
        return word2index_label, index2word_label
    # BUG FIX: was `codes.open` (misspelled `codecs`); use builtin open and
    # close the handle deterministically.
    with open(vocabulary_label, 'r', encoding='utf8') as zhihu_f_train:
        lines = zhihu_f_train.readlines()
    word2index_label = {}
    index2word_label = {}
    label_count_dict = {}
    for line in lines:
        if '__label__' in line:
            label = line[line.index('__label__') + len("__label__"):].strip().replace('\n', '')
            label_count_dict[label] = label_count_dict.get(label, 0) + 1
    list_label = sort_by_value(label_count_dict)
    print('length of list_label:', len(list_label))
    count = 0
    if use_seq2seq:
        # Reserve the first three indices for the seq2seq control tokens.
        for idx, special in enumerate([_GO, _END, _PAD]):
            word2index_label[special] = idx
            index2word_label[idx] = special
    for i, label in enumerate(list_label):
        if i < 10:
            count_value = label_count_dict[label]
            print('label:', label, 'count_value:', count_value)
            count = count + count_value
        index = i + 3 if use_seq2seq else i
        word2index_label[label] = index
        index2word_label[index] = label
    print('count top10:', count)
    if not os.path.exists(cache_path):
        # BUG FIX: cache was opened in 'ab' (append) mode, which corrupts the
        # pickle if the file ever gains stale bytes; write fresh with 'wb'.
        # Also create the cache directory so the open cannot fail on first run.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as data_f:
            pickle.dump((word2index_label, index2word_label), data_f)
    print('create_vocabulary_label_sorted.ended.len of vocvabulary_label:', len(index2word_label))
    return word2index_label, index2word_label
def ReadFileSentiWordNet(filename):
    """Parse a SentiWordNet-format file into three parallel lists.

    Each tab-separated line is expected to carry the positive score in
    column 2, the negative score in column 3, and space-separated synset
    terms (e.g. "good#1") in column 4. The trailing 2-character sense
    suffix ("#1") is stripped from every term; each distinct term is kept
    once with the scores of the first line it appeared on.

    Args:
        filename: path to the SentiWordNet data file (UTF-8).

    Returns:
        (senti_word, senti_pos, senti_neg) — parallel lists of term,
        positive score (float), negative score (float).
    """
    senti_word = []
    senti_pos = []
    senti_neg = []
    # O(1) membership test instead of the original O(n) `in list` scan per word.
    seen = set()
    # BUG FIX: was `codes.open` (misspelled `codecs`) and the handle was never
    # closed; use builtin open in a context manager.
    with open(filename, 'r', encoding='utf-8') as data_file:
        full_data = data_file.read().splitlines()
    for line in full_data:  # one SentiWordNet entry per line
        columns = line.split('\t')
        words = columns[4].split(' ')
        # Original inner loop shadowed the outer index `i`; use a fresh name.
        for raw_word in words:
            word = raw_word[:-2]  # drop the trailing "#k" sense number
            if word not in seen:
                seen.add(word)
                senti_word.append(word)
                senti_pos.append(float(columns[2]))
                senti_neg.append(float(columns[3]))
    return senti_word, senti_pos, senti_neg
def uopen(*args):
    """Wrapper around the builtin open() that forces UTF-8 text encoding.

    All positional arguments are forwarded unchanged to open().
    """
    handle = open(*args, encoding="UTF-8")
    return handle
from setuptools import setup
# BUG FIX: was `from codes import open` — the stdlib module is `codecs`;
# importing from the non-existent `codes` raises ImportError at install time.
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

# Use the README as the long description shown on PyPI.
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='easy_contract',
    version='0.2.0',
    description='Easy contract-based programming',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/fxcqz/easy_contract',
    author='Matt Rawcliffe',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Quality Assurance',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],
    keywords='contract programming',
    py_modules=['easy_contract', 'test_contract'],
)
from setuptools import setup, find_packages
# BUG FIX: was `from codes import open` — the stdlib module is `codecs`.
from codecs import open
import os

here = os.path.abspath(os.path.dirname(__file__))

# BUG FIX: the encoding kwarg was mistakenly placed inside os.path.join and
# the open() call was missing its closing parenthesis (SyntaxError).
with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()

# BUG FIX: this URL sat as a bare statement (SyntaxError); it is a reference
# to the packaging template this file is based on, so keep it as a comment.
# https://github.com/pypa/sampleproject/blob/master/setup.py