def is_visited(question_id):
    """Tell whether a question has already been stored.

    Counts the ``Question`` rows whose ``question_id`` equals the argument.

    :param question_id: identifier of the question to look up, may be None
    :return: True if at least one matching row exists.  True is also
        returned when ``question_id`` is None or when the query fails, so
        the answer defaults to "visited" whenever it cannot be determined.
    """
    if question_id is None:
        return True
    try:
        matches = Session.query(Question).filter_by(
            question_id=question_id).count()
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate instead of being logged and swallowed.
        logger.error('fail to query question_id %s', question_id,
                     exc_info=True)
        return True
    return matches != 0
def save_analyzed_result(md5_string, result_json):
    """Store an analysis result as an ``LtpResult`` row.

    ``result_json`` is serialized with ``json.dumps(..., ensure_ascii=False)``
    and committed together with its key.  A failed commit is logged and
    rolled back; the exception is not propagated to the caller.

    :param md5_string: key under which the result is stored
    :param result_json: JSON-serializable analysis result
    :return: None
    """
    ltp_result = LtpResult(md5_string,
                           json.dumps(result_json, ensure_ascii=False))
    Session.add(ltp_result)
    logger.info('start to insert ltp result, md5=%s', md5_string)
    try:
        Session.commit()
    except Exception:
        # Log before rolling back so the traceback is recorded even if the
        # rollback itself fails.
        logger.error('fail to insert', exc_info=True)
        Session.rollback()
    else:
        # Only report completion when the commit actually succeeded.
        logger.info('finished inserting ltp result')
def get_analyzed_result(question_text):
    """Return the analysis of ``question_text``, reusing a stored result.

    The text is keyed by ``md5(question_text)``.  If an ``LtpResult`` row
    with that key exists, its ``json_text`` is wrapped in an
    ``AnalyzedSentence``.  Otherwise ``analyze`` is called, the result is
    saved with ``save_analyzed_result`` and then wrapped.

    :param question_text: text to analyze, may be None
    :return: an ``AnalyzedSentence``, or None when ``question_text`` is None
    :raise RuntimeError: when ``analyze`` raises it; the original exception
        is re-raised so its message and traceback are preserved
    """
    if question_text is None:
        return None

    md5_string = md5(question_text)
    ltp_result = Session.query(LtpResult).filter_by(md5=md5_string).first()
    if ltp_result is not None:
        # NOTE(review): this branch passes the stored JSON text, while the
        # branch below passes the object returned by analyze() -- confirm
        # that AnalyzedSentence accepts both forms.
        return AnalyzedSentence(md5_string, ltp_result.json_text)

    try:
        result_json = analyze(question_text)
    except RuntimeError:
        logger.error('fail to invoke ltp api, text=%s', question_text,
                     exc_info=True)
        # Bare raise keeps the original error instead of replacing it with
        # an empty RuntimeError().
        raise
    save_analyzed_result(md5_string, result_json)
    return AnalyzedSentence(md5_string, result_json)
def k_fold_cross_dataset(k, num):
    """Generate k-fold cross test set and train set.

    The file names are derived from ``k`` and ``num`` only; fold indices in
    the names are zero-based.  If every file already exists, nothing is
    regenerated and the names are returned as-is.  Otherwise ``num``
    paragraphs are loaded, shuffled, dealt round-robin into ``k`` folds and
    written out with ``generate_dataset`` (fold ``i`` is the test set, all
    other folds together form the train set).

    Example:
        In:
            k_fold_cross_dataset(2, 10)
        Out:
            [
                {
                    'test_text': 'data/2-fold-cross-10-test-text-0.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-0.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-0.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-0.txt',
                },
                {
                    'test_text': 'data/2-fold-cross-10-test-text-1.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-1.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-1.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-1.txt',
                }
            ]

    :param k: int, number of folds
    :param num: int, number of paragraphs to distribute over the folds
    :return: list of k dicts mapping 'test_text', 'test_label',
        'train_text' and 'train_label' to file paths
    :raise RuntimeError: if the query does not yield exactly ``num``
        paragraphs
    """
    # Fill in k and num once; the doubled braces survive as {kind} and {i}.
    name_pattern = 'data/{k}-fold-cross-{num}-{{kind}}-{{i}}.txt'.format(
        k=k, num=num)
    file_names = []
    for i in range(k):
        file_names.append({
            'test_text': name_pattern.format(kind='test-text', i=i),
            'test_label': name_pattern.format(kind='test-label', i=i),
            'train_text': name_pattern.format(kind='train-text', i=i),
            'train_label': name_pattern.format(kind='train-label', i=i),
        })

    # values() works on both Python 2 and 3; all() stops at the first
    # missing file.
    all_exist = all(os.path.isfile(path)
                    for names in file_names
                    for path in names.values())
    if all_exist:
        return file_names

    filtered_paragraphs = Session.query(FilteredParagraph).limit(num).all()
    if len(filtered_paragraphs) != num:
        raise RuntimeError(
            'expected %s filtered paragraphs, got %s'
            % (num, len(filtered_paragraphs)))
    random.shuffle(filtered_paragraphs)

    # Deal the shuffled paragraphs round-robin into the k folds.
    folds = [[] for _ in range(k)]
    for index, filtered in enumerate(filtered_paragraphs):
        folds[index % k].append(filtered.paragraph)

    for i, names in enumerate(file_names):
        test_set = folds[i]
        train_set = []
        for j, fold in enumerate(folds):
            if j != i:
                train_set.extend(fold)
        # generate test set
        generate_dataset(test_set, names['test_text'], names['test_label'])
        # generate train set
        generate_dataset(train_set, names['train_text'],
                         names['train_label'])
    return file_names
#!/usr/bin/env python
# coding: utf-8
# Export script: writes every Paragraph returned by the query to
# data/baidu-zhidao-paragraph.txt as UTF-8.  Each record is the question
# title on one line, one line per reply, then a blank line.

__author__ = 'wilfredwang'

import traceback

from data_access import Session
from data_access import Paragraph

with open('data/baidu-zhidao-paragraph.txt', 'wb') as f:
    count = 0
    try:
        for paragraph in Session.query(Paragraph):
            pieces = [paragraph.question.title, '\n']
            # NOTE(review): the crawler code fills `paragraph.replies`;
            # confirm that `reply` is the right attribute name here.
            for reply in paragraph.reply:
                pieces.extend([reply.content, '\n'])
            # Blank line separates one paragraph record from the next.
            pieces.append('\n')
            f.writelines([piece.encode('utf-8') for piece in pieces])
            count = count + 1
            # Progress indicator: number of paragraphs written so far.
            print(count)
    except:
        # Deliberately catches everything (including Ctrl-C) so the number
        # of paragraphs written before the failure is always reported.
        print('error, count %d' % count)
        traceback.print_exc()
def extract(self, target):
    """Extract one question page and store the question and its paragraphs.

    Steps:
      1. Take the question id from ``target`` (``/DIGITS.html``) and stop if
         ``is_visited`` reports it as visited.
      2. Fetch the page, read the category id from the
         ``alog-alias="qb-class-info"`` anchor and the title via
         ``get_title``, then insert a ``Question``.
      3. For this page and every following page, build one ``Paragraph``
         per ``div.line.content`` block that contains an "ask" ``dt`` and
         insert it.

    Any failure in steps 1-2 is logged and ends the extraction.  When the
    page turns out to be the access-error page, ``self.exit_signal`` is set.

    :param target: URL of the question page
    :return: None
    """
    logger.info('check whether visited')
    # The dot is escaped so that only a literal ".html" suffix matches.
    matched_result = re.findall(r'/(\d+)\.html', target)
    if not matched_result:
        logger.error('invalid question page url %s', target)
        return
    question_id = matched_result[0]
    if is_visited(question_id):
        logger.info('%s is visited, skip', question_id)
        return

    page = self.get_page(target, delay=True)
    if page is None:
        logger.info('page is none, skip')
        return

    # save question
    anchor = page.find('a', {'alog-alias': 'qb-class-info'})
    if anchor is None:
        if page.find('title', text=u'百度--您的访问出错了') is None:
            logger.error('invalid question page %s', target)
        else:
            logger.error('auth page, set exit signal')
            self.exit_signal.set()
        return
    category_url = to_unicode(anchor['href'])
    category_ids = re.findall(r'/(\d+)', category_url)
    if not category_ids:
        # Previously an IndexError escaped here; treat it like the other
        # "page not usable" cases.
        logger.error('fail to get category id in %s', target)
        return
    category_id = category_ids[0]
    title = get_title(page)
    if title is None:
        logger.error('fail to get title in %s', target)
        return
    question = Question(question_id, category_id, title)
    Session.add(question)
    logger.info('start to insert %s', question)
    try:
        Session.commit()
    except Exception:
        logger.error('fail to insert %s, rollback', question, exc_info=True)
        Session.rollback()
        return
    logger.info('finished inserting question')

    while not self.exit_signal.isSet() and page:
        for line_content_div in page.find_all('div', 'line content'):
            # answer only, skip
            if line_content_div.find('dt', 'ask f-12 grid') is None:
                continue

            # generate paragraph
            paragraph = Paragraph(question_id)

            # generate reply
            a_content = line_content_div.find('pre', {'accuse': 'aContent'})
            if a_content is None:
                logger.error('can not find aContent, structure changed')
                # Give up on the rest of this page; the next page is
                # still attempted.
                break
            reply = to_unicode(a_content.strings)
            paragraph.replies.append(Reply(1, reply))
            for pre in line_content_div.find_all('pre'):
                pre_accuse = pre.get('accuse', 'no')
                if pre_accuse == 'aRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(1, reply))
                elif pre_accuse == 'qRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(0, reply))

            Session.add(paragraph)
            logger.info('start to insert paragraph(%d replies)',
                        len(paragraph.replies))
            try:
                Session.commit()
            except Exception:
                logger.error('fail to insert %s, rollback', paragraph,
                             exc_info=True)
                Session.rollback()
            else:
                # Only report completion when the commit succeeded.
                logger.info('finished inserting paragraph')

        next_page_link = get_next_page_link(page)
        page = self.get_page(next_page_link, delay=True)
    logger.info('finished extracting paragraph in %s', target)
def cleanup(self):
    """Release the database session by calling ``Session.remove()``.

    NOTE(review): ``Session`` is presumably a SQLAlchemy ``scoped_session``,
    in which case ``remove()`` discards the current session -- confirm
    against the ``data_access`` module.
    """
    Session.remove()