示例#1
0
def load_data_multiable_new(word2index,word2index_label,valid_portion=0.05,max_training_data=1e6,
                            training_data_path='train-zhihu4-only-title-all.txt',multi_label_flag=True,use_seq2seq=False,seq2seq_label_length=6):
    """Read a ``__label__``-annotated text file and encode the labels of every line.

    Each line must contain exactly one ``__label__`` separator. The text before
    it is kept as the raw, stripped input string; the text after it holds one
    or more space-separated labels.

    Args:
        word2index: Not used by this function; kept so existing callers keep
            working.
        word2index_label: Mapping from label string to integer index. In
            seq2seq mode it must also contain the module-level ``_PAD``,
            ``_GO`` and ``_END`` tokens.
        valid_portion: Fraction of the examples excluded from the training
            split; the first ``1 - valid_portion`` of the examples form it.
        max_training_data: Not used by this function.
        training_data_path: Path of the UTF-8 encoded data file.
        multi_label_flag: Outside seq2seq mode, ``True`` encodes the labels
            with ``transform_multiable_as_multihot``; ``False`` stores the
            single label's index.
        use_seq2seq: Build fixed-length target and decoder-input sequences
            instead of classification targets.
        seq2seq_label_length: Length of those fixed-length sequences.

    Returns:
        None. NOTE(review): the visible code builds ``train`` but contains no
        ``return`` statement; the tail of this function appears to be missing
        and should be restored from the original project.

    Raises:
        ValueError: If a line does not contain exactly one ``__label__``.
        KeyError: If a label is missing from ``word2index_label``.
    """
    print('load_data.started...')
    print('load_data_multi_newtraining_data_path:',training_data_path)
    # The context manager guarantees the handle is closed even on errors.
    with open(training_data_path, 'r', encoding='utf8') as zhihu_f:
        lines = zhihu_f.readlines()
    X = []
    Y = []
    Y_decoder_input = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')
        y = y.strip()
        x = x.strip()
        if i < 1:
            print(i,'x0:',x)
        if use_seq2seq:
            ys = y.split(' ')
            pad_index = word2index_label[_PAD]
            # One slot is reserved for _END (targets) / _GO (decoder input),
            # so at most seq2seq_label_length - 1 labels fit.
            kept_labels = ys[:seq2seq_label_length - 1]
            ys_multihot_list = [pad_index] * seq2seq_label_length
            ys_decoder_input = [pad_index] * seq2seq_label_length
            # Target sequence: label indices followed by the _END token.
            for j, label in enumerate(kept_labels):
                ys_multihot_list[j] = word2index_label[label]
            ys_multihot_list[len(kept_labels)] = word2index_label[_END]
            # Decoder input: the same labels shifted right by one behind _GO.
            ys_decoder_input[0] = word2index_label[_GO]
            for j, label in enumerate(kept_labels):
                ys_decoder_input[j + 1] = word2index_label[label]
            if i < 10:
                print(i,'ys:======>0',ys)
                print(i,'ys_multihot_list:=======>1',ys_multihot_list)
                print(i,'ys_decoder_input:=======>2',ys_decoder_input)
        else:
            if multi_label_flag:
                ys = y.split(' ')
                # Labels live in the label vocabulary, not the word vocabulary.
                ys_index = [word2index_label[label] for label in ys]
                ys_multihot_list = transform_multiable_as_multihot(ys_index)
            else:
                ys_multihot_list = word2index_label[y]
        if i <= 3:
            print('ys_index:')
            print(i,'y:',y,' ;ys_multihot_list:',ys_multihot_list)
        # Collect every example, not only the first few that are echoed above.
        X.append(x)
        Y.append(ys_multihot_list)
        if use_seq2seq:
            Y_decoder_input.append(ys_decoder_input)
    # Split once, after all lines have been consumed.
    number_examples = len(X)
    print('number_examples:',number_examples)
    train_size = int((1 - valid_portion) * number_examples)
    train = (X[:train_size], Y[:train_size])
    # NOTE(review): ``train`` is not returned; see the docstring.
示例#2
0
def create_vocabulary_label(vocabulary_label='./data/train-zhihu4-only-title-all.txt',name_scope='',use_seq2seq=False):
    """Build, or load from cache, the label <-> index vocabularies.

    If the pickle cache ``cache_vocabulary_label_pik/<name_scope>_label_vocabulary.pik``
    exists it is loaded and returned directly. Otherwise every line of
    ``vocabulary_label`` containing ``__label__`` contributes the text after
    the first ``__label__`` as one label. Labels are counted, ordered by
    ``sort_by_value``, numbered in that order and written to the cache.

    Args:
        vocabulary_label: Path of the UTF-8 encoded training file to scan.
        name_scope: Prefix of the cache file name.
        use_seq2seq: When true, indices 0, 1 and 2 are reserved for the
            module-level ``_GO``, ``_END`` and ``_PAD`` tokens and real labels
            start at index 3.

    Returns:
        A ``(word2index_label, index2word_label)`` tuple of dicts.
    """
    print('create_vocabulary_labrl_storted.started.training_data_path:',vocabulary_label)
    cache_path = 'cache_vocabulary_label_pik/' + name_scope + '_label_vocabulary.pik'
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code; the cache file must
        # come from a trusted location.
        with open(cache_path,'rb') as data_f:
            word2index_label, index2word_label = pickle.load(data_f)
        return word2index_label, index2word_label

    marker = '__label__'
    label_count_dict = {}
    with open(vocabulary_label, 'r', encoding='utf8') as zhihu_f_train:
        for line in zhihu_f_train:
            if marker in line:
                label = line[line.index(marker) + len(marker):].strip().replace('\n','')
                label_count_dict[label] = label_count_dict.get(label, 0) + 1
    # presumably orders labels by descending frequency; confirm against
    # sort_by_value, which is defined outside this block
    list_label = sort_by_value(label_count_dict)

    print('length of list_label:',len(list_label))

    word2index_label = {}
    index2word_label = {}
    if use_seq2seq:
        for i, label in enumerate([_GO,_END,_PAD]):
            word2index_label[label] = i
            index2word_label[i] = label

    # Real labels are shifted past the three reserved seq2seq tokens.
    offset = 3 if use_seq2seq else 0
    count = 0
    for i, label in enumerate(list_label):
        if i < 10:
            count_value = label_count_dict[label]
            print('label:',label,'count_value:',count_value)
            count = count + count_value
        index = i + offset
        word2index_label[label] = index
        index2word_label[index] = label
    print('count top10:',count)

    if not os.path.exists(cache_path):
        # Create the cache directory on first use so the dump cannot fail on it.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        # 'wb' rather than 'ab': appending would leave a stale first pickle
        # in front of the new one if the file ever appeared in between.
        with open(cache_path,'wb') as data_f:
            pickle.dump((word2index_label,index2word_label),data_f)
    print('create_vocabulary_label_sorted.ended.len of vocvabulary_label:',len(index2word_label))
    return word2index_label,index2word_label
def ReadFileSentiWordNet(filename):
    """Read a tab-separated SentiWordNet-style file into three parallel lists.

    For every data line, column 4 holds space-separated terms, column 2 the
    positive score and column 3 the negative score. The last two characters
    of each term are dropped. Each distinct resulting word is recorded once,
    with the scores of the first line it appears on. Blank lines and lines
    starting with ``#`` are skipped.

    Args:
        filename: Path of the UTF-8 encoded file.

    Returns:
        A ``(senti_word, senti_pos, senti_neg)`` tuple of lists of equal
        length: the words, their positive scores and their negative scores.

    Raises:
        IndexError: If a data line has fewer than five tab-separated columns.
        ValueError: If a score column is not a valid float.
    """
    senti_word = []
    senti_pos = []
    senti_neg = []
    # Set mirror of senti_word so the "already recorded?" check is O(1).
    seen_words = set()
    with open(filename, 'r', encoding='utf-8') as handle:
        full_data = handle.read().splitlines()
    for row in full_data:  # one row per line of the file
        if not row.strip() or row.startswith('#'):
            continue
        columns = row.split('\t')
        # Examine every term on the line.
        for term in columns[4].split(' '):
            # Drop the last two characters.
            # NOTE(review): presumably a "#<digit>" sense suffix; a
            # multi-digit suffix would not be removed completely. Confirm
            # against the actual data.
            word = term[:-2]
            # Record the word only the first time it is seen.
            if word not in seen_words:
                seen_words.add(word)
                senti_word.append(word)
                senti_pos.append(float(columns[2]))
                senti_neg.append(float(columns[3]))
    return senti_word, senti_pos, senti_neg
示例#4
0
def uopen(*args):
    """Call ``open`` with the given positional arguments and UTF-8 encoding.

    Args:
        *args: Positional arguments forwarded unchanged to ``open``.

    Returns:
        Whatever ``open`` returns for those arguments.
    """
    utf8_options = {"encoding": "UTF-8"}
    return open(*args, **utf8_options)
示例#5
0
from setuptools import setup
from codes import open
from os import path

# Absolute path of the directory that contains this setup script.
here = path.abspath(path.dirname(__file__))

# The README next to this script is used as the long description.
# NOTE(review): `open` is the name imported at the top of this script, not
# necessarily the built-in; confirm that import resolves.
readme_file = path.join(here, 'README.md')
with open(readme_file, encoding='utf-8') as f:
    long_description = f.read()

# Trove classifiers for the package.
trove_classifiers = [
    'Development Status :: 3 - Alpha',
    'Intended Audience :: Developers',
    'Topic :: Software Development :: Quality Assurance',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.6',
]

# Package metadata; the distribution consists of two top-level modules.
setup(
    name='easy_contract',
    version='0.2.0',
    description='Easy contract-based programming',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/fxcqz/easy_contract',
    author='Matt Rawcliffe',
    classifiers=trove_classifiers,
    keywords='contract programming',
    py_modules=['easy_contract', 'test_contract'],
)
示例#6
0
文件: setup.py 项目: jjh2kiss/shark
from setuptools import setup, find_packages
from codes import open

import os

# Absolute path of the directory that contains this setup script.
here = os.path.abspath(os.path.dirname(__file__))

# Read the README next to this script. `encoding` is an argument of open(),
# not of os.path.join(), so the join call is closed before it.
with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()

https://github.com/pypa/sampleproject/blob/master/setup.py