def process_corpus(tr_in_filename, te_in_filename, u_in_filename, tr_out_filename, te_out_filename, u_out_filename): input_f = open(tr_in_filename, 'r') tr_original_corpus = pickle.load(input_f) input_f.close() input_f = open(te_in_filename, 'r') te_original_corpus = pickle.load(input_f) input_f.close() input_f = open(u_in_filename, 'r') u_original_corpus = pickle.load(input_f) input_f.close() tr_instances = [d['question'] for d in tr_original_corpus if '' not in d['target']] te_instances = [d['question'] for d in te_original_corpus if '' not in d['target']] u_instances = [d['question'] for d in u_original_corpus if ((not 'target' in d) or '' not in d['target'])] vect = get_features() vect.fit(tr_instances + te_instances + u_instances) v_instances = vect.transform(tr_instances + te_instances + u_instances) v_instances = csr_matrix(v_instances > 0, dtype=int8) print v_instances.shape tr_corpus = Corpus() tr_corpus.instances = v_instances[:len(tr_instances)] tr_corpus.full_targets = [d['target'] for d in tr_original_corpus if '' not in d['target']] tr_corpus.representations = [_get_repr(i[0]) for i in tr_instances] tr_corpus._features_vectorizer = vect tr_corpus.save_to_file(tr_out_filename) te_corpus = Corpus() te_corpus.instances = v_instances[:len(te_instances)] te_corpus.full_targets = [d['target'] for d in te_original_corpus if '' not in d['target']] te_corpus.representations = [_get_repr(i[0]) for i in te_instances] te_corpus._features_vectorizer = vect te_corpus.save_to_file(te_out_filename) u_corpus = Corpus() u_corpus.instances = v_instances[:len(u_instances)] u_corpus.full_targets = [d['target'] if ('target' in d and '' not in d['target']) else [] for d in u_original_corpus] u_corpus.representations = [_get_repr(i[0]) for i in u_instances] u_corpus._features_vectorizer = vect u_corpus.save_to_file(u_out_filename)