示例#1
0
def makeType(train, valid):
    """Build the answer-type vocabulary from the train/valid CSV files.

    Reads both CSVs (the answer type is column 2 of each row), assigns every
    distinct type an integer id, and publishes the mapping through the
    module-level globals ``typelen``, ``type2num``, ``num2type`` and
    ``validl``.

    :param train: path of the training CSV file
    :param valid: path of the validation CSV file
    :return: (train_labels, valid_labels) as numpy integer arrays
    """
    global typelen
    global type2num, num2type
    global validl
    trainl = ljqpy.LoadCSV(train)
    validl = ljqpy.LoadCSV(valid)
    # Sort the distinct types: a bare list(set(...)) yields a different order
    # on every run (str hash randomization), so the id assignment would not
    # be reproducible between processes.
    answertypelist = sorted({row[2] for row in trainl} | {row[2] for row in validl})
    typelen = len(answertypelist)
    type2num = {t: i for i, t in enumerate(answertypelist)}
    num2type = {i: t for t, i in type2num.items()}
    # Encode each split's answer types with the shared mapping.
    qy = [type2num[row[2]] for row in trainl]
    tyq = [type2num[row[2]] for row in validl]
    return np.array(qy), np.array(tyq)
示例#2
0
def MakeVocab():
    """Load the cached word/char frequency lists and build the id maps.

    Reads the module-level ``vocabFile``/``charFile`` paths and publishes the
    lookup tables through the globals ``id2w``/``w2id`` and ``id2c``/``c2id``.
    Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; only the ``vocab_size``
    (resp. ``char_size``) most frequent entries are kept.
    """
    global id2w, w2id, id2c, c2id
    freqw = []
    freqc = []
    # Require BOTH caches: the original checked only vocabFile and crashed in
    # LoadCSV when charFile alone was missing.
    if os.path.exists(vocabFile) and os.path.exists(charFile):
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        # Best-effort: fall back to an empty vocabulary ('<PAD>'/'<UNK>' only)
        # rather than raising.
        print('wordlist or charlist not found')
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {y: x for x, y in enumerate(id2c)}
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    """Return (input, output) TokenLists for a seq2seq corpus.

    When a cached token list exists at *dict_file* (two halves separated by
    the '<@@@>' marker) it is loaded directly; otherwise the vocabularies are
    counted from the CSV at *fn*, filtered by *min_freq*, and the cache is
    written back.
    """
    if dict_file is not None and os.path.exists(dict_file):
        print('loading', dict_file)
        cached = ljqpy.LoadList(dict_file)
        sep = cached.index('<@@@>')
        return TokenList(cached[:sep]), TokenList(cached[sep + 1:])
    # Count token frequencies separately for the two CSV columns.
    counters = [{}, {}]
    for row in ljqpy.LoadCSV(fn):
        for text, counter in zip(row, counters):
            for token in text.split(delimiter):
                counter[token] = counter.get(token, 0) + 1
    # Keep only tokens seen at least min_freq times.
    wlists = [[w for w, c in ljqpy.FreqDict2List(counter) if c >= min_freq]
              for counter in counters]
    print('seq 1 words:', len(wlists[0]))
    print('seq 2 words:', len(wlists[1]))
    itokens = TokenList(wlists[0])
    otokens = TokenList(wlists[1])
    if dict_file is not None:
        ljqpy.SaveList(wlists[0] + ['<@@@>'] + wlists[1], dict_file)
    return itokens, otokens
 def Load(self):
     """Load the cached training tensors and companion text files.

     Restores the arrays previously saved in the HDF5 file ``self.h5name``
     and the per-question text from '<h5name>.txt' (and, when present, the
     raw contexts from '<h5name>.c.txt') into instance attributes.
     """
     # Open read-only: this method only reads, and h5py's historical default
     # mode ('a') could create or modify the file when no mode is given.
     with h5py.File(self.h5name, 'r') as dfile:
         self.xQuestion = dfile['xQuestion'][:]
         self.xContext = dfile['xContext'][:]
         self.xQuestionC = dfile['xQuestionC'][:]
         self.xQuestionA = dfile['xQuestionA'][:]
         self.xContextC = dfile['xContextC'][:]
         self.xContextA = dfile['xContextA'][:]
         self.y_start = dfile['y_start'][:]
         self.y_end = dfile['y_end'][:]
         self.startEnd = dfile['startEnd'][:]
     # Companion text file: one row per question, (id, raw question, answer).
     data = ljqpy.LoadCSV(self.h5name + '.txt')
     self.questionId = [x[0] for x in data]
     self.questionRaw = [x[1] for x in data]
     self.realAnswer = [x[2] for x in data]
     if os.path.exists(self.h5name + '.c.txt'):
         self.contextRaw = ljqpy.LoadCSV(self.h5name + '.c.txt')
示例#5
0
def MakeOwnDatas(train):
    """Vectorize the questions of the CSV at *train*.

    Each row's first column is cut into tokens, then encoded both as word
    ids and as char ids, padded/truncated to ``maxQLen``.

    :return: (word_matrix, char_matrix) as numpy arrays
    """
    word_rows = []
    char_rows = []
    for record in ljqpy.LoadCSV(train):
        tokens = CutSentence(record[0])
        word_rows.append(Tokens2Intlist(tokens, maxQLen))
        char_rows.append(Chars2Intlist(tokens, maxQLen))
    return np.array(word_rows), np.array(char_rows)
def _CountFreqs():
    """Scan ``trainFile`` once and return (word_freq, char_freq) dicts.

    Question characters are weighted by 10 and question tokens by the number
    of passages, so question text counts more heavily than passage text.
    """
    freqw = {}
    freqc = {}
    for line in ljqpy.LoadCSVg(trainFile):
        record = json.loads(''.join(line).strip().lower())
        question = re.sub(r'\s+', ' ', record["query"].strip())
        weight = len(record["passages"])  # invariant per question; hoisted
        for token in CutSentence(question):
            for ch in token:
                freqc[ch] = freqc.get(ch, 0) + 10
            token = ChangeToken(token)
            freqw[token] = freqw.get(token, 0) + weight
        for passage in record["passages"]:
            # NOTE(review): FullToHalf is applied to passages but not to the
            # question above — presumably intentional; confirm upstream.
            context = FullToHalf(passage["passage_text"])
            context = re.sub(r'\s+', ' ', context.strip())
            for token in CutSentence(context):
                for ch in token:
                    freqc[ch] = freqc.get(ch, 0) + 1
                token = ChangeToken(token)
                freqw[token] = freqw.get(token, 0) + 1
    return freqw, freqc
def MakeVocab():
    """Build (or reload) the word/char vocabularies and publish the id maps.

    When the cached frequency lists exist they are loaded; otherwise the
    training corpus is scanned, the counts are saved back to the cache, and
    the globals ``id2w``/``w2id`` and ``id2c``/``c2id`` are (re)built with
    '<PAD>' and '<UNK>' reserved at ids 0 and 1.
    """
    global id2w, w2id, id2c, c2id
    vocabFile = 'data/wordlist.txt'
    charFile = 'data/charlist.txt'
    if os.path.exists(vocabFile):
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        freqw, freqc = _CountFreqs()
        freqw = ljqpy.FreqDict2List(freqw)
        ljqpy.SaveCSV(freqw, vocabFile)
        freqc = ljqpy.FreqDict2List(freqc)
        ljqpy.SaveCSV(freqc, charFile)
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {y: x for x, y in enumerate(id2c)}
def ReadQuestionAnswers():
    """Populate the global ``qidAnswers`` map (qid -> set of answer strings).

    Reads both the train and valid 'qid_answer_expand' files; the answers in
    column 2 are '|'-separated. Malformed rows (not exactly 3 columns) are
    skipped.
    """
    global qidAnswers
    qidAnswers = {}
    sources = [
        './train_data/qid_answer_expand',
        './train_data/qid_answer_expand.valid'
    ]
    for path in sources:
        for row in ljqpy.LoadCSV(path):
            if len(row) != 3:
                continue
            qidAnswers[row[0]] = set(row[2].split('|'))
示例#8
0
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
	'''
	Build the word/char token lists for the input and output sequences.

	:param fn: corpus CSV; each row holds the (input, output) sequence pair
	:param min_freq: tokens occurring fewer than this many times are dropped
	:param delimiter: separator used to split each sequence into tokens
	:param dict_file: optional cache file; loaded when present, else written
	:return: (itokens, otokens) TokenList pair
	'''
	# If a cached word/char list exists there is no need to rebuild it.
	if dict_file is not None and os.path.exists(dict_file):
		print('loading', dict_file)
		lst = ljqpy.LoadList(dict_file)
		midpos = lst.index('<@@@>')
		itokens = TokenList(lst[:midpos])
		otokens = TokenList(lst[midpos+1:])
		return itokens, otokens
	# Otherwise rebuild the vocabularies from the corpus.
	data = ljqpy.LoadCSV(fn)
	wdicts = [{}, {}]
	for ss in data:
		for seq, wd in zip(ss, wdicts):
			for w in seq.split(delimiter): 
				wd[w] = wd.get(w, 0) + 1  # per-column token frequency count
	wlists = []
	for wd in wdicts:	
		wd = ljqpy.FreqDict2List(wd)
		wlist = [x for x,y in wd if y >= min_freq]
		wlists.append(wlist)
	print('seq 1 words:', len(wlists[0]))
	print('seq 2 words:', len(wlists[1]))
	itokens = TokenList(wlists[0])
	otokens = TokenList(wlists[1])
	if dict_file is not None:
		# Cache both halves in one file, separated by the '<@@@>' marker.
		ljqpy.SaveList(wlists[0]+['<@@@>']+wlists[1], dict_file)
	return itokens, otokens
        print(x, y)
    print(
        s2s.decode_sequence_readout('A black dog eats food .'.split(),
                                    delimiter=' '))
    print(
        s2s.decode_sequence_fast('A black dog eats food .'.split(),
                                 delimiter=' '))
    while True:
        quest = input('> ')
        print(s2s.decode_sequence_fast(quest.split(), delimiter=' '))
        rets = s2s.beam_search(quest.split(), delimiter=' ')
        for x, y in rets:
            print(x, y)
elif 'test' in sys.argv:
    import ljqpy
    valids = ljqpy.LoadCSV('data/en2de.s2s.valid.txt')
    en = [x[0].split() for x in valids[:100]]
    rets = s2s.decode_sequence_readout(en, delimiter=' ')
    for x in rets[:5]:
        print(x)

    rets = s2s.beam_search(en, delimiter=' ', verbose=1)
    for i, x in enumerate(rets[:5]):
        print('-' * 20)
        print(valids[i][1])
        for y in x:
            print(y)

    rets = s2s.decode_sequence_fast(en, delimiter=' ', verbose=1)
    for x in rets[:5]:
        print(x)