def chequear_plagio(plagio_algorithm, doc1, doc2, file, trained_model, UMBRAL_SIMILARIDAD=0.9):
    doc1 = ut.preprocess_text(doc1)
    doc2 = ut.preprocess_text(doc2)
    if plagio_algorithm == 'A':
        return chequear_plagio_word2vec(doc1, doc2, file, trained_model, UMBRAL_SIMILARIDAD)
    else:
        return chequear_plagio_wordnet(doc1, doc2, file, trained_model, UMBRAL_SIMILARIDAD)
def generate_ctx_emission_table(lines, mode="en", ctx_mode="prev_word", lower=False,
                                norm_tense=False, replace_number=False, replace_year=False,
                                replace_symbol=False):
    hashmap = {}
    Y = {}
    skipped = []
    word_freq = {"##UNK##": 0}
    for ln, line in enumerate(lines):
        try:
            x, _ = line.split(" ")
            if lines[ln - 1] == "":
                y = "##START##"
            else:
                if ctx_mode == "prev_word":
                    y, _ = lines[ln - 1].split(" ")
                else:
                    _, y = lines[ln - 1].split(" ")
                    # x is the word, y is the POS of the prev word
            y = utils.preprocess_text(y, mode, lower, norm_tense, replace_number,
                                      replace_year, replace_symbol)
            x = utils.preprocess_text(x, mode, lower, norm_tense, replace_number,
                                      replace_year, replace_symbol)
            if x in word_freq:
                word_freq[x] += 1
            else:
                word_freq[x] = 1
            if y in hashmap:
                if x in hashmap[y]:
                    hashmap[y][x] += 1
                else:
                    hashmap[y][x] = 1
            else:
                hashmap[y] = {}
                hashmap[y][x] = 1
            if y in Y:
                Y[y] += 1
            else:
                Y[y] = 1
        except Exception as e:
            if line not in skipped:
                # print(e)
                skipped.append(line)
    # print("Skipped", len(skipped), "lines: ", skipped)
    return {"x_hashmap": hashmap, "x_word_freq": word_freq, "y_tags": Y}
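# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of how this table builder might be driven. It
# assumes the usual two-column "word TAG" file layout with blank lines between
# sentences; the file name below is an assumption made for illustration.
with open("train.txt", encoding="utf8") as f:  # hypothetical path
    lines = [ln.rstrip("\n") for ln in f]

tables = generate_ctx_emission_table(lines, mode="en", ctx_mode="prev_word")

# tables["x_hashmap"] maps each context y (previous word, or "##START##") to a
# dict of word counts; tables["y_tags"] counts how often each context occurs.
print(len(tables["x_hashmap"]), len(tables["y_tags"]))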
def classify(opts):
    '''
    model and vectorizer must be stored in the opts.model_dir directory
    '''
    prop_names = read_prop_names(opts.prop_names_file)
    models = []
    vects = []  # vectorizers
    for name in prop_names:
        model_file = opts.model_dir + "/" + name + ".model"
        vect_file = opts.model_dir + "/" + name + ".vect"
        if os.path.exists(model_file):
            print("Loading " + model_file)
            model = pickle.load(open(model_file, "rb"))
            print("Loading " + vect_file)
            vect = pickle.load(open(vect_file, "rb"))
            models.append(model)
            vects.append(vect)

    print("Classifying")
    out = open(opts.classify_outfile, "w")
    with open(opts.index_file) as lines:
        for line in lines:
            obj = json.loads(line)
            processed_sentences = [utils.preprocess_text(i) for i in obj['text']]
            obj['scores'] = []
            for i in range(len(models)):
                X = vects[i].transform(processed_sentences)
                scores = models[i].decision_function(X)
                obj['scores'].append(scores.tolist())
            out.write(json.dumps(obj) + "\n")
    out.close()
def predict_rating(review_text, classifier, vectorizer, decision_threshold=0.5):
    """
    Given the classifier, vectorizer and text, classify whether that text
    is a positive or negative review.

    args:
        review_text (str): The review that needs to be classified
        classifier (ReviewClassifier): The model that has been trained for classification
        vectorizer (utils.Vectorizer): The Vectorizer that will be used to convert the text to a vector
        decision_threshold (float): Probability above which the review is labelled positive

    returns:
        class (str): The class, which is either positive or negative
    """
    review_text = preprocess_text(review_text)
    review_vector_np = vectorizer.vectorize(review_text)
    review_vector = torch.from_numpy(review_vector_np)
    result = torch.sigmoid(classifier(review_vector.view(1, -1)))

    class_label = 0 if result.item() < decision_threshold else 1
    return vectorizer.rating_vocab.lookup_index(class_label)
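# --- Usage sketch (not part of the original module) ---
# Hedged call-site example: `classifier` and `vectorizer` are assumed to be the
# trained ReviewClassifier and fitted Vectorizer that the surrounding project
# provides; the review string is invented.
sample_review = "The plot was thin but the performances were wonderful."
predicted = predict_rating(sample_review, classifier, vectorizer,
                           decision_threshold=0.5)
print(predicted)  # e.g. "positive" or "negative", per vectorizer.rating_vocab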
def searchresults(search_string, num_results):
    search_string = preprocess_text(search_string)
    search_vect = np.array([question_to_vec(search_string, w2v_model)])
    search_results = []
    cosine_similarities = pd.Series(
        cosine_similarity(search_vect, all_title_embeddings)[0])
    # cosine_similarities = cosine_similarities*(0.4*data.overall_scores + 0.1*(data.sentiment_polarity))
    for i, j in cosine_similarities.nlargest(int(num_results)).items():
        # Build a short snippet from the first 200 characters of the question body.
        output = ''
        for t in data.question_content[i][:200].split():
            output += " " + str(t)
        temp = {
            'Title': str(data.original_title[i]),
            'url': str(data.question_url[i]),
            'Id': str(i),
            'answer': str(data.answers_content[i]),
            'Tags': str(data.tags[i]),
            'similarity_score': str(j)[:5],
            'votes': str(data.overall_scores[i]),
            'Body': str(output)
        }
        search_results.append(temp)
    return search_results
def fit_transform(self, texts):
    # Tokenize and clean each text, then fit a bag-of-words model over the cleaned corpus.
    clean_texts = [' '.join(preprocess_text(t)) for t in texts]
    transformer = CountVectorizer(min_df=3, max_df=0.7,
                                  preprocessor=lambda x: x,
                                  tokenizer=lambda t: t.split())
    return np.array(transformer.fit_transform(clean_texts).todense())
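# --- Standalone sketch (not part of the original module) ---
# Because preprocess_text already tokenizes, the vectorizer's preprocessor and
# tokenizer are reduced to identity / whitespace-split. The same pattern with a
# trivial stand-in for preprocess_text (names here are assumptions):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text_stub(t):  # stand-in for the project's preprocess_text
    return t.lower().split()

texts = ["A cat sat.", "The cat sat on the mat.", "Dogs sit too."]
clean_texts = [' '.join(preprocess_text_stub(t)) for t in texts]
vec = CountVectorizer(min_df=1, preprocessor=lambda x: x,
                      tokenizer=lambda t: t.split())
X = np.asarray(vec.fit_transform(clean_texts).todense())
print(X.shape)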
def _lcs_match(max_dist):
    f.fill(0)
    g.clear()

    ### longest common sub sequence
    # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
    for i in range(N):
        # note(zhiliny):
        # unlike standard LCS, this is specifically optimized for the setting
        # because the mismatch between sentence pieces and original text will
        # be small
        for j in range(i - max_dist, i + max_dist):
            if j >= M or j < 0:
                continue

            if i > 0:
                g[(i, j)] = 0
                f[i, j] = f[i - 1, j]

            if j > 0 and f[i, j - 1] > f[i, j]:
                g[(i, j)] = 1
                f[i, j] = f[i, j - 1]

            f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
            if (preprocess_text(paragraph_text[i], lower=config.uncased,
                                remove_space=False) == tok_cat_text[j]
                    and f_prev + 1 > f[i, j]):
                g[(i, j)] = 2
                f[i, j] = f_prev + 1
def fit_transform(self, texts):
    clean_texts = [preprocess_text(t) for t in texts]
    if self.train:
        size = self.d
        self.model = Word2Vec(size=size, workers=8, min_count=3)
        self.model.build_vocab(clean_texts)
        self.model.train(clean_texts, total_examples=len(clean_texts), epochs=10)
    else:
        size = 300
        self.model = api.load('word2vec-google-news-300')

    # Average the word vectors of each text to obtain a sentence embedding.
    embs = []
    for text in clean_texts:
        emb = np.zeros(size)
        n = 0
        for w in text:
            if w in self.model:
                emb += self.model[w]
                n += 1
        if n != 0:
            emb = emb / n
        embs.append(emb)
    sentence_embeddings = np.array(embs)
    return sentence_embeddings
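# --- Version note (not part of the original module) ---
# The Word2Vec(size=...) call and the `w in self.model` membership test match
# the gensim 3.x API. Under gensim 4.x the training branch would look roughly
# like the sketch below (an assumption, since the original targets 3.x).
from gensim.models import Word2Vec

clean_texts = [["the", "cat", "sat"], ["the", "dog", "sat"]]  # toy tokenized corpus
model = Word2Vec(vector_size=50, workers=2, min_count=1)      # 4.x keyword names
model.build_vocab(clean_texts)
model.train(clean_texts, total_examples=len(clean_texts), epochs=10)

# Membership tests and lookups go through the KeyedVectors object in 4.x:
emb = model.wv["cat"] if "cat" in model.wv else None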
def predict_sentences_2_idxs(self):
    """Replaces each Quora question with indexes corresponding to the
    respective positions of tokens in the embedding matrix. If include_unknown
    is true, unknown tokens are replaced with the unknown-token index;
    otherwise they are ignored.

    Creates 2 binary files:
        parsed_train_pos.txt: list of lists containing token indexes (integers)
            of the positive class
        parsed_train_neg.txt: list of lists containing token indexes (integers)
            of the negative class
    """
    fo = open(self.config.parsed_predict_file, 'w')
    self.load_dicts()
    questions = pd.read_csv(self.config.predict_file,
                            usecols=["question_text"], index_col=False)
    unk_idx = self.word2idx[self.config.unknown_token]

    for quest in questions.question_text:
        tokens = utils.preprocess_text(quest)
        if self.config.include_unknown:
            idxs = [self.word2idx.get(token, unk_idx) for token in tokens]
        else:
            idxs = [self.word2idx.get(token) for token in tokens]
            idxs = [idx for idx in idxs if idx]
        fo.write((str(" ".join(str(num) for num in idxs)) + "\n"))
def clasificar_documento(doc, treshold=0.3, categories=[
        'economy', 'technology', 'health', 'science-environment', 'business',
        'politics', 'entertainment', 'sport']):
    try:
        doc_english = translator.translate(ut.preprocess_text(doc)).text
        sentences = tokenize.sent_tokenize(doc_english)
        sum_scores = inicializar_scores(categories)
        for sent in sentences[5:10]:
            url = 'https://api.dandelion.eu/datatxt/cl/v1'
            payload = {
                'text': sent,
                'model': "54cf2e1c-e48a-4c14-bb96-31dc11f84eac",
                'token': 'cbbf951e9b704ea4a3ddfd09d27bed1d',
                'min_score': treshold
            }
            jsonData = requests.get(url, params=payload).json()
            sum_scores = update_scores(sum_scores, jsonData)
        return get_argmax(sum_scores)
    except:
        print("Error categorizing the text.")
def main():
    args = parser.parse_args()
    src = Path(args.source_dir)
    dest = Path(args.destination_dir)
    dataset_names = (args.test_dir_name, args.train_dir_name)

    print("Converting raw dataset to rows...\n")
    row_dict = utils.raw_dataset_to_row_dict(
        dataset_root_path=src,
        dataset_dir_names=dataset_names,
    )

    for dataset_name, rows in row_dict.items():
        for i, (text, label) in tqdm(enumerate(rows), total=len(rows),
                                     desc="Preprocessing text"):
            row_dict[dataset_name][i] = (utils.preprocess_text(text), label)
        print()

    if not os.path.exists(dest):
        print(f"Creating directory {dest} since it doesn't exist...")
        os.makedirs(dest)

    print("Saving datasets as TSV...")
    for dataset_name in dataset_names:
        dataset_dest = dest / f"{dataset_name}.tsv"
        with open(dataset_dest, mode="w+", encoding="utf8", errors="replace") as f:
            csv.writer(f, delimiter="\t").writerows(row_dict[dataset_name])
    print("Done!\n")
def make_dataset(params):
    strings = re.compile('[^a-zA-Z]')
    data_dir = os.path.join('./processed', params["directory"])
    tags = os.listdir(data_dir)
    if params["tags"] is not None:
        tags = params["tags"].split(";")
    if "unclassified" in tags:
        tags.remove("unclassified")
    if len(tags) < 2:
        return None

    db = Database()
    texts = []
    labels = []
    for i, tag in enumerate(tags):
        class_dir = os.path.join(data_dir, tag)
        ids = os.listdir(class_dir)
        for id in ids:
            data, status = db.fetch_document(id)
            if data:
                try:
                    json_path = os.path.join(data['processed_path'], data['id'] + '.json')
                    with open(json_path, 'r') as fi:
                        data['content'] = json.loads(fi.read())
                    doc = Document(data)
                    text = doc.get_text()
                    text = utils.preprocess_text(text)
                    texts.append(text)
                    labels.append(i)
                except Exception as e:
                    pass

    train_x, test_x, train_y, test_y = train_test_split(
        texts, labels, test_size=params["split"])
    return train_x, test_x, train_y, test_y, tags
def evaluate_line(self, line):
    if isinstance(line, str):
        raw_test_comments = [line]
    elif isinstance(line, (list, tuple)):
        raw_test_comments = [question for question, entity_dict in line]
    else:
        raise ValueError('[Format error] the `question` field must be a string or a list!')

    processed_test_comments = []
    for comment in raw_test_comments:
        processed_test_comments.append(preprocess_text(comment))
    test_sequences = self.tokenizer.texts_to_sequences(processed_test_comments)
    final_test_data = pad_sequences(test_sequences, maxlen=150)

    rets = self.model.predict(x=final_test_data, batch_size=1)
    ret = []
    for pred, question in zip(rets, raw_test_comments):
        # argsort returns the indices that would sort the array in ascending order
        sort_index = pred.argsort()
        pred_ret = [{
            'question': question,
            'intent': self.id2label[_index],
            'score': float(pred[_index])
        } for _index in sort_index[-5:][::-1]]
        ret.append(pred_ret)
        # label = self.id2label[pred.argmax()]
        # score = float(pred.max())
        # ret.append([{'question': question, 'intent': label, 'score': score}])
    logger.info("Intent prediction result for question `{}`: {}".format(line, ret))
    return ret
def explain(self, text, nwords, return_weights=False):
    '''
    Use `LimeTextExplainer` to obtain the top `nwords` most important/polar
    words in the `text` as an explanation.

    Parameters
    --------------
    text: str
        The text to explain.
    nwords: int
        The number of most important words to return (i.e. explanation size).
    return_weights: bool
        Set to True to also return the weights assigned by LIME.

    Returns
    ---------------
    word_ranking: list
        Indexes of the `nwords` top-ranked words in the text.
    ranked_words: list
        List of `nwords` top-ranked words in the text.
    weights: dict, optional
        The dictionary of weights (word position -> weight) assigned by LIME
        to the words in the text.
    explanation: optional
        The explanation object returned by `LimeTextExplainer`.
    '''
    text = preprocess_text(text)
    text_words = get_tokens(text)
    class_names = ['negative', 'positive']
    # bow is set to False because word order is important
    explainer = LimeTextExplainer(class_names=class_names,
                                  feature_selection='auto',
                                  bow=False,
                                  split_expression=' ',
                                  verbose=False)
    explanation = explainer.explain_instance(
        text_instance=text,
        labels=[0, 1],
        classifier_fn=self.predict_texts,
        num_features=nwords,
        num_samples=self.nsamples)

    # sort weights by decreasing absolute value
    weights = OrderedDict(
        sorted(explanation.as_map()[1], key=lambda weight: -abs(weight[1])))
    word_ranking = np.array(list(weights.keys()))
    ranked_words = [text_words[i] for i in word_ranking]

    if return_weights:
        return word_ranking, ranked_words, weights, explanation
    return word_ranking, ranked_words
def pico_preprocess(line):
    line = dict(text=line.abstract, P=line.population, I=line.intervention, O=line.outcome)
    if pico_constraint(line):
        return {k: preprocess_text(v) for k, v in line.items()}
    else:
        return line
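# --- Usage sketch (not part of the original module) ---
# Illustration of the record shape this helper expects: any object with
# .abstract/.population/.intervention/.outcome attributes. The namedtuple and
# field values are invented; pico_constraint comes from the surrounding module.
from collections import namedtuple

PicoRow = namedtuple("PicoRow", ["abstract", "population", "intervention", "outcome"])
row = PicoRow(abstract="We enrolled 40 adults with asthma ...",
              population="adults with asthma",
              intervention="inhaled corticosteroids",
              outcome="FEV1 at 12 weeks")

processed = pico_preprocess(row)
# -> {"text": ..., "P": ..., "I": ..., "O": ...}, preprocessed when the PICO
#    constraint holds, otherwise the dict is returned unmodified.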
def explain_text_words(self, text, rank_by_importance=True):
    '''
    Word level explanation.
    '''
    text = preprocess_text(text)
    text_words = get_tokens(text)
    y = self.model.predict_class(text)
    word_ranking, values = self.sbe(text_words, y, rank_by_importance)
    ranked_words = [text_words[i] for i in word_ranking]
    return word_ranking, ranked_words, values
def getsearchresults():
    params = request.json
    if params is None:
        params = request.args
    query = params["query"]
    query = preprocess_text(query)
    tags = list(predict_tags(query))
    results = searchresults(query, params["num_results"])
    return jsonify({'tags': tags, 'results': results})
def attack(self, text, target_class, search_algorithm, random_attack=False):
    '''
    Attack text to change the prediction to `target_class`.

    Parameters
    -----------------
    text: str
        The text to attack.
    target_class: int
        The class to change the classification to.
    search_algorithm: str
        The search algorithm to use to attack the text: greedy or beam.
    random_attack: bool, optional
        Randomly selects words to target for attack.
    '''
    text = preprocess_text(text)
    x = get_tokens(text)
    explanation_size = int(self.percentage * len(x))

    if self.explainer is None:
        # target all words
        print("No explainer provided. Targeting all words in the input...")
        candidate_words_indexes = np.arange(len(x))
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    elif not random_attack:
        print('Generating explanation...')
        candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
    else:
        print("Randomly selecting candidate words to perturb...")
        candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
        candidate_words = np.array(x)[candidate_words_indexes].tolist()

    assert len(candidate_words_indexes) == len(candidate_words)
    print("Extracted candidate words: ", candidate_words)

    synonyms_map = self.build_synonyms_map(candidate_words)
    print("Built synonyms map.")
    candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
    print("Filtered replacements.")
    Attacker.print_candidate_stats(candidate_replacements)
    # print("candidate_replacements: ")
    # pprint(candidate_replacements)

    if search_algorithm == 'greedy':
        print('Running greedy search...')
        used_replacements, adversary_found, prediction = self.greedy_search(
            x, candidate_replacements, target_class)
    elif search_algorithm == 'beam':
        print('Running beam search...')
        used_replacements, adversary_found, prediction = self.beam_search(
            x, candidate_replacements, target_class)
    else:
        raise ValueError('Invalid search algorithm provided')
    print("Chose replacements.")

    # Generate adversarial text
    adv_text = Attacker.get_adv_text(text, used_replacements)
    return used_replacements, adversary_found, adv_text, prediction
def process(fipc):
    global index
    for line in fipc:
        line = line.strip().split()
        ipc = line[0].decode('utf-8')
        desc = " ".join(line[1:])
        normalized = utils.preprocess_text(desc.decode('utf-8')).keys()
        for w in normalized:
            ipclist = index.get(w, [])
            ipclist.append(ipc)
            index[w] = ipclist
def text2idx(self, input_text):
    tokens = utils.preprocess_text(input_text)
    if self.config.include_unknown:
        idxs = [self.word2idx.get(token, self.unk_idx) for token in tokens]
    else:
        idxs = [self.word2idx.get(token) for token in tokens]
        idxs = [idx for idx in idxs if idx]
    return np.array(idxs)
def predict(self, text):
    """
    Predicts the class of a text and returns the label prediction and the
    model probability of the predicted label. If multiple text items are
    passed, the method returns a tuple containing two arrays -- array one
    contains predicted labels and array two contains predicted label
    probabilities.

    Arguments:
        text (str or list): text to be classified.
    """
    # If self.clean_text_, apply the preprocess_text function according to type.
    if self.clean_text_:
        if type(text) == str:
            text = preprocess_text(text)
        else:
            text = [preprocess_text(item_text) for item_text in text]

    # Predict and return text label and probability.
    return self.model_.predict(text)
def collect_quotes(quotes):
    """Structure final quotes as a list of records for display in a table."""
    collection = []
    for q in quotes:
        # Checking for 'PERSON' before assigning a speaker - if the quote is of type
        # 'Heuristic', the conditions are relaxed and we accept the quote with a
        # blank speaker name
        if q.get('named_entity_type') == 'PERSON' or q.get('quote_type') == 'Heuristic':
            speaker = q.get('named_entity', "")
            quote = preprocess_text(q.get('quote', ""))
            collection.append({'speaker': speaker, 'quote': quote})
    return collection
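# --- Usage sketch (not part of the original module) ---
# Fabricated example records showing the dictionary keys collect_quotes reads.
quotes = [
    {"named_entity": "Jane Doe", "named_entity_type": "PERSON",
     "quote": "We expect results next week.", "quote_type": "Syntactic"},
    {"named_entity": "", "named_entity_type": "",
     "quote": "No comment was given.", "quote_type": "Heuristic"},
]
print(collect_quotes(quotes))
# -> [{'speaker': 'Jane Doe', 'quote': ...}, {'speaker': '', 'quote': ...}]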
def deanonymize_dataset(
    rg_path: str,
    standardized_dataset: Dataset,
    processed_dataset_path: str = None,
    n_samples: int = None,
):
    """Take an anonymized dataset and add back the original dataset columns."""
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    dataset = Dataset.load_from_disk(rg_path)

    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))
        standardized_dataset.set_visible_rows(list(range(n_samples)))

    text_columns = []

    # Add columns from the standardized dataset
    dataset.add_column('document', standardized_dataset['document'])
    text_columns.append('document')

    if 'summary:reference' in standardized_dataset.column_names:
        dataset.add_column('summary:reference',
                           standardized_dataset['summary:reference'])
        text_columns.append('summary:reference')

    # Preprocessing all the text columns
    dataset = dataset.update(
        lambda x: {f'preprocessed_{k}': preprocess_text(x[k]) for k in text_columns})

    # Run the Spacy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = load('en_core_web_sm')
    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Directly save to disk
    dataset.save_to_disk(processed_dataset_path)
    return dataset
def train(self, train_set: pd.DataFrame, force: bool = False, save: bool = True) -> None:
    if not force and self.LogReg_pipeline is not None:
        return

    self.LogReg_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=self.stop_words)),
        ('clf', LogisticRegression(solver='sag')),
    ])
    self.LogReg_pipeline.fit(
        train_set['comment_text'].map(lambda com: utils.preprocess_text(com)),
        train_set['bannable'])

    if save:
        utils.dump(self.LogReg_pipeline, "log_pipeline")
def count_missing_words():
    data_model = QuoraQuestionsModel(DataConfig(), 10, 10, 300)
    data_model.load_dicts()
    questions = pd.read_csv(data_model.config.train_file,
                            usecols=["question_text"], index_col=False)
    word_count = defaultdict(int)

    for quest in questions.question_text:
        tokens = preprocess_text(quest)
        for token in tokens:
            if token not in data_model.word2idx:
                word_count[token] += 1

    fo = open("../data/missing_word_counts_new.txt", "wb")
    pickle.Pickler(fo, 4).dump(word_count)
    fo.close()
def parse_doc(collection, doc):
    """Perform quote extraction conditionally on one document"""
    try:
        doc_id = str(doc['_id'])
        if doc is None:
            app_logger.error('Document "{0}" not found.'.format(doc_id))
        else:
            text = doc['body']
            text_length = len(text)
            if text_length > MAX_BODY_LENGTH:
                app_logger.warn(
                    'Skipping document {0} due to long length {1} characters'.format(
                        doc['_id'], text_length))
                if update_db:
                    collection.update(
                        {'_id': ObjectId(doc_id)},
                        {
                            '$set': {
                                'lastModifier': 'max_body_len',
                                'lastModified': datetime.now()
                            },
                            '$unset': {
                                'quotes': 1
                            }
                        },
                        upsert=True
                    )
            else:
                # Process document
                doc_text = utils.preprocess_text(doc['body'])
                spacy_doc = nlp(doc_text)
                quotes = extract_quotes(doc_id=doc_id, doc=spacy_doc,
                                        write_tree=write_quote_trees_in_file)
                if update_db:
                    collection.update(
                        {'_id': ObjectId(doc_id)},
                        {'$set': {
                            'quotes': quotes,
                            'lastModifier': 'quote_extractor',
                            'lastModified': datetime.now()}})
                else:
                    # If dry run, then display extracted quotes (for testing)
                    print('=' * 20, ' Quotes ', '=' * 20)
                    for q in quotes:
                        print(q, '\n')
    except:
        app_logger.exception("message")
        traceback.print_exc()
def fix(self, text, target_class, beam_size=4, random_fix=False):
    '''
    Change the classification of a text to the correct class.

    Parameters
    ------------
    text: str
        The text that is misclassified.
    target_class: int
        The label of the class to change the prediction to.
    beam_size: int
    random_fix: bool, optional
        If set to True, words will be targeted randomly for replacement.

    Returns
    ----------------
    suggestions: list
        The list of suggested replacement sets.
    '''
    text = preprocess_text(text)
    x = get_tokens(text)
    explanation_size = int(self.percentage * len(x))

    if self.explainer is None:
        # target all words
        print("No explainer provided. Targeting all words in the input...")
        candidate_words_indexes = np.arange(len(x))
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    elif not random_fix:
        print('Generating explanation...')
        candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
    else:
        print("Randomly selecting candidate words to perturb...")
        candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
        candidate_words = np.array(x)[candidate_words_indexes].tolist()

    print("Extracted candidate words: ", candidate_words)
    synonyms_map = self.build_synonyms_map(candidate_words)
    print("Built synonyms map.")
    candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
    print('Filtered replacements.')

    print('Running beam search...')
    suggestions = self.beam_search(x, candidate_replacements, target_class,
                                   beam_size=beam_size, return_multiple=True)
    return suggestions
def similarity_check(ques_text, ref_ques_dict):
    '''
    For a given question text, checks whether the sum of its similarity scores
    with all reference questions is below the minimum score. If so, takes each
    word from the question and finds similar words, also tries spell check, and
    returns the question text augmented with the possible similar words.

    rtype: str
    '''
    if not ques_text:
        return ''

    # remove trailing spaces, spl characters
    ques_text = preprocess_text(ques_text)
    sim_words = []

    # get the similarity scores from
    # the string matching algorithms
    sim_scores = get_sim_ref_ques(ques_text)

    if sum(sim_scores.values()) <= min_sim_score:
        # if the similarity score is very low
        # lower than minimum accepted, try
        # finding similar words as a fallback
        # option #1
        sim_words = get_similar_words_sent(ques_text, max_sim_words, min_count)
        if not sim_words:
            # if there are no similar words
            # there is a possibility that
            # this word has been misspelled
            # try to do a spell check
            spelled = spell_check(ques_text)
            sim_words = [spelled]

    if sim_words:
        # if either the bigrams from gensim
        # word vec or spell check generated
        # similar words, add to the original
        # question text
        ques_text = str(ques_text).replace('"', '')
        new_text = str(" ".join(sim_words))
        ques_text = ques_text + " " + new_text

    return ques_text
def predict(self, tweet, seq_length):
    tweet = preprocess_tweet(tweet, punctuation=True)
    tweet = preprocess_text(tweet)
    tokens = [tokenize_custom(tweet, self.vocab_to_int)]
    features = pad_features(tokens, seq_length=seq_length)

    self.cuda()
    with torch.no_grad():
        h = self.init_hidden(1)
        output, h = self(
            torch.from_numpy(features).type(torch.cuda.LongTensor), h)
        softmax = nn.Softmax(dim=1)
        return softmax(output).cpu().numpy()
def predict():
    checkpoint_file = os.path.join(MODEL_PATH, CHECKPOINT_FILE)
    classes_to_labels_file = os.path.join(MODEL_PATH, LABELS_FILE)
    embedding_matrix_file = os.path.join(MODEL_PATH, EMBEDDING_MATRIX_FILE)
    # model_file = os.path.join(MODEL_PATH, 'model.pkl')
    tokenizer_file = os.path.join(MODEL_PATH, TOKENIZER_FILE)

    predicate_label = pickle.load(open(classes_to_labels_file, 'rb'), encoding="iso-8859-1")
    embedding_matrix = pickle.load(open(embedding_matrix_file, 'rb'), encoding="iso-8859-1")
    # model = pickle.load(open(model_file, 'rb'), encoding="iso-8859-1")
    nb_words, EMBEDDING_DIM = embedding_matrix.shape
    label2id = {k: t.argmax() for k, t in predicate_label.items()}
    id2label = {_id: label for label, _id in label2id.items()}

    model = make_model(nb_words, EMBEDDING_DIM, embedding_matrix, len(predicate_label))
    model.load_weights(checkpoint_file)
    # model = load_model(checkpoint_file)
    tokenizer = pickle.load(open(tokenizer_file, 'rb'), encoding="iso-8859-1")

    test_data = read_data(DEV_FILE)
    raw_test_comments = [t[0] for t in test_data]
    test_y = np.array([predicate_label[t[1]] for t in test_data])
    processed_test_comments = []
    for comment in raw_test_comments:
        processed_test_comments.append(preprocess_text(comment))
    test_sequences = tokenizer.texts_to_sequences(processed_test_comments)
    final_test_data = pad_sequences(test_sequences, maxlen=150)
    # print('test_data', test_data[:3])

    print('Evaluating the model')
    ret = model.predict(x=final_test_data, batch_size=1)
    # print('Predictions:', ret)
    # print('gold label', 'prediction', 'question')
    rets = []
    for label, pred, question in zip(test_y, ret, test_data):
        print(id2label[label.argmax()], id2label[pred.argmax()], question)
        rets.append([id2label[label.argmax()], id2label[pred.argmax()], question])
    print('Accuracy: {}'.format(len([t for t in rets if t[0] == t[1]]) / len(rets)))