def get_train_teem_data(question_len=20):
    """Build word-level training data for subject-span tagging.

    For each (question, subject) pair from ``readTrainData()``, locates the
    subject inside the question and produces a 0/1 label vector over the
    question's *words* (1 = word belongs to the subject span), plus the
    question encoded as vocab ids.

    Returns:
        (padded question-id sequences, padded label vectors), both of
        length ``question_len``.
    """
    questions, subjects, predicates = readTrainData()
    qs = []
    labels = []
    for q, sub in zip(questions, subjects):
        # Word-level tokenisation (whitespace split) ...
        question = re.split("\\s+", q.lower())
        subject = re.split("\\s+", sub.lower())
        # ... and space-free character strings for substring matching.
        qtemp = q.lower().replace(" ", "")
        stemp = sub.lower().replace(" ", "")
        if not stemp in qtemp:
            # Full subject not found verbatim; fall back to its first word.
            # NOTE(review): if subject[0] is also absent, .index() below
            # raises ValueError — presumably the data guarantees a hit.
            print(qtemp, stemp)
            stemp = subject[0]
        # Character offset of the subject inside the space-free question.
        i = qtemp.index(stemp)
        prefix = q.lower().replace(" ", "")[:i]
        # Walk the words, accumulating their characters, until the
        # accumulated text covers the prefix before the subject; the loop
        # variable `s` is deliberately read after the loop (word index
        # where the subject starts).
        pre = ""
        for s in range(len(question)):
            pre += question[s]
            if prefix == pre:
                # Prefix ends exactly at a word boundary: subject starts
                # at the *next* word.
                s += 1
                break
            if prefix in pre:
                # Subject starts mid-word; approximate with this word.
                break
        e = s + len(subject)
        label = np.zeros(question_len)
        # numpy slicing clips e > question_len silently.
        label[s:e] = 1
        # Unknown words map to id 0.
        qidx = [vocab.get(word, 0) for word in question]
        qs.append(qidx)
        labels.append(label)
    return padding(qs, maxlen=question_len, value=1), padding(labels, maxlen=question_len, value=0)
def get_train_teem_data(question_len=50):
    """Build character-level training data for subject-span tagging.

    For each (question, subject) pair from ``readTrainData()``, lowercases
    and removes spaces from both, marks every occurrence of the subject
    inside the question with the sentinel character ``"X"`` (safe because
    the question is lowercased, so it cannot itself contain an uppercase
    ``X``), and derives a per-character 0/1 label vector from the marks.

    Returns:
        (padded question char-id sequences, padded label vectors), both of
        length ``question_len``.
    """
    questions, subjects, predicates = readTrainData()
    qs = []
    labels = []
    for question, subject in zip(questions, subjects):
        question = question.lower().replace(" ", "")
        subject = subject.lower().replace(" ", "")
        # NOTE(review): assumes subject is non-empty; str.replace("") would
        # insert the sentinel at every position — verify upstream data.
        pattern = question.replace(subject, "X" * len(subject))
        label = [int(w == "X") for w in pattern]
        # Unknown characters map to id 0.
        qidx = [vocab.get(w, 0) for w in question]
        # (Removed dead code: the original also built an unused `sidx`
        # encoding of the subject on every iteration.)
        qs.append(qidx)
        labels.append(label)
    return padding(qs, maxlen=question_len, value=1), padding(labels, maxlen=question_len, value=0)
def get_test_teem_data(questions, question_len=20):
    """Encode test questions at word level as padded vocab-id sequences.

    Each question is lowercased, split on whitespace, and every token is
    mapped to its vocab id (0 for out-of-vocabulary tokens).
    """
    encoded = [
        [vocab.get(token, 0) for token in re.split("\\s+", raw.lower())]
        for raw in questions
    ]
    return padding(encoded, maxlen=question_len, value=1)
def get_test_teem_data(questions, question_len=50):
    """Encode test questions at character level as padded vocab-id sequences.

    Each question is lowercased and stripped of spaces; every remaining
    character maps to its vocab id (0 for out-of-vocabulary characters).
    """
    encoded = [
        [vocab.get(ch, 0) for ch in raw.lower().replace(" ", "")]
        for raw in questions
    ]
    return padding(encoded, maxlen=question_len, value=1)
def load_questions(vocab=vocab, path=config.seg_test_question_path):
    """Load questions from ``path`` (one per line, UTF-8).

    Each line is stripped, spaces removed, and every remaining character
    mapped to its vocab id (0 for out-of-vocabulary). Returns the id
    sequences padded to length 50 with value 1.
    """
    questions = []
    # `with` guarantees the handle is closed (original leaked it).
    with open(path, encoding="utf-8") as f:
        for line in f:
            chars = line.strip().replace(" ", "")
            questions.append([vocab.get(ch, 0) for ch in chars])
    return padding(questions, 50, value=1)
def load_predicates(vocab=vocab, path=config.all_predicate_path):
    """Load predicates from ``path`` (one per line, UTF-8).

    Each line is stripped, spaces removed, and every remaining character
    mapped to its vocab id (0 for out-of-vocabulary). Returns the id
    sequences padded to length 20 with value 1.
    """
    predicates = []
    # `with` guarantees the handle is closed (original leaked it).
    with open(path, encoding="utf-8") as f:
        for line in f:
            chars = line.strip().replace(" ", "")
            predicates.append([vocab.get(ch, 0) for ch in chars])
    return padding(predicates, maxlen=20, value=1)