def get_all_url():
    """
    Merge the URLs of every ``paper_url`` document whose ``parentTag`` is "all".

    The merged result is too large to insert into the db, so it is not
    written back as a cache after the query.

    :return: dict produced by ``PaperURL.to_dic()``; its ``'urls'`` entry
        holds the merged URLs after ``collection_utils.unique`` was applied
    """
    cursor = db.paper_url.find({"parentTag": "all"})
    merged = cnki_class.PaperURL([], "null", "all")
    for item in cursor:
        merged.urls += item['urls']
    result = merged.to_dic()
    # Portable three-way comparison: the ``cmp`` builtin no longer exists on
    # Python 3, and this expression yields the same -1 / 0 / 1 result.
    # NOTE(review): the return value of unique() is ignored here, so this
    # relies on the helper de-duplicating the list in place - confirm.
    collection_utils.unique(result['urls'], lambda x, y: (x > y) - (x < y))
    return result
def get_url_by_parent_tag(parent_tag):
    """
    Return the URL record for ``parent_tag``, building and storing it if absent.

    ``get_url_by_tag(parent_tag)`` is tried first. When it yields nothing
    truthy, the ``urls`` of every ``paper_url`` document whose ``parentTag``
    equals ``parent_tag`` are merged into a single record, which is passed to
    ``insert_url`` before being returned.

    :param parent_tag: parent tag to look up
    :return: the value from ``get_url_by_tag`` when it is truthy, otherwise
        the merged dict produced by ``PaperURL.to_dic()``
    """
    existing = get_url_by_tag(parent_tag)
    if existing:
        return existing

    cursor = db.paper_url.find({"parentTag": parent_tag})
    merged = cnki_class.PaperURL([], parent_tag, "all")
    for item in cursor:
        merged.urls += item['urls']
    result = merged.to_dic()
    # Portable three-way comparison: the ``cmp`` builtin no longer exists on
    # Python 3, and this expression yields the same -1 / 0 / 1 result.
    # NOTE(review): the return value of unique() is ignored here, so this
    # relies on the helper de-duplicating the list in place - confirm.
    collection_utils.unique(result['urls'], lambda x, y: (x > y) - (x < y))
    insert_url(result)
    return result
# NOTE(review): build_paper_url is defined twice in the visible source with the
# same logic; within a single module the later definition replaces this one.
def build_paper_url(parent_tag_tag_tuple):
    """
    Collect the paper URLs listed for one tag.

    The first result page is fetched with ``get_first_page``. When the
    computed page count is exactly 1 the URLs are parsed from that page;
    otherwise every page from 1 to ``page_num`` is requested from
    ``constants.ith_page_uri``. If ``is_check_code_page`` flags a response,
    ``get_first_page`` is called again to refresh ``ctl`` and the headers and
    the same page is requested one more time.

    :param parent_tag_tag_tuple: tuple of ``(parent_tag, tag)``
    :return: PaperURL Object holding the collected URLs (with every 'kns'
        replaced by 'KCMS'), or None when no URL was collected
    """
    parent_tag, tag = parent_tag_tag_tuple
    result = cnki_class.PaperURL([], tag, parent_tag)
    ctl, new_header, first_page_html = get_first_page(tag)
    item_num = find_item_num(first_page_html)
    page_num = calculate_page_num(item_num)

    if page_num == 1:
        result.urls = parse_url_list(first_page_html)
    else:
        query_id = parse_query_id(first_page_html)
        for current_page in range(1, page_num + 1):
            ith_page = session.get(
                constants.ith_page_uri,
                params=build_ith_page_query(ctl, current_page, query_id),
                headers=new_header).text
            if is_check_code_page(ith_page):
                # Refresh ctl/headers and retry this page once.
                # NOTE(review): query_id is still the one parsed from the
                # initial first page - confirm it stays valid after refresh.
                ctl, new_header, _ = get_first_page(tag)
                ith_page = session.get(
                    constants.ith_page_uri,
                    params=build_ith_page_query(ctl, current_page, query_id),
                    headers=new_header).text
            result.urls += parse_url_list(ith_page)

    if not result.urls:
        return None

    # Portable three-way comparison: the ``cmp`` builtin no longer exists on
    # Python 3, and this expression yields the same -1 / 0 / 1 result.
    collection_utils.unique(result.urls, lambda x, y: (x > y) - (x < y))
    result.urls = [uri.replace('kns', 'KCMS') for uri in result.urls]
    return result
# NOTE(review): an earlier definition of build_paper_url with the same logic is
# visible above; within a single module this later definition is the one used.
def build_paper_url(parent_tag_tag_tuple):
    """
    Collect the paper URLs listed for one tag.

    The first result page is fetched with ``get_first_page``. When the
    computed page count is exactly 1 the URLs are parsed from that page;
    otherwise every page from 1 to ``page_num`` is requested from
    ``constants.ith_page_uri``. If ``is_check_code_page`` flags a response,
    ``get_first_page`` is called again to refresh ``ctl`` and the headers and
    the same page is requested one more time.

    :param parent_tag_tag_tuple: tuple of ``(parent_tag, tag)``
    :return: PaperURL Object holding the collected URLs (with every 'kns'
        replaced by 'KCMS'), or None when no URL was collected
    """
    parent_tag, tag = parent_tag_tag_tuple
    result = cnki_class.PaperURL([], tag, parent_tag)
    ctl, new_header, first_page_html = get_first_page(tag)
    item_num = find_item_num(first_page_html)
    page_num = calculate_page_num(item_num)

    if page_num == 1:
        result.urls = parse_url_list(first_page_html)
    else:
        query_id = parse_query_id(first_page_html)
        for current_page in range(1, page_num + 1):
            ith_page = session.get(
                constants.ith_page_uri,
                params=build_ith_page_query(ctl, current_page, query_id),
                headers=new_header).text
            if is_check_code_page(ith_page):
                # Refresh ctl/headers and retry this page once.
                # NOTE(review): query_id is still the one parsed from the
                # initial first page - confirm it stays valid after refresh.
                ctl, new_header, _ = get_first_page(tag)
                ith_page = session.get(
                    constants.ith_page_uri,
                    params=build_ith_page_query(ctl, current_page, query_id),
                    headers=new_header).text
            result.urls += parse_url_list(ith_page)

    if not result.urls:
        return None

    # Portable three-way comparison: the ``cmp`` builtin no longer exists on
    # Python 3, and this expression yields the same -1 / 0 / 1 result.
    collection_utils.unique(result.urls, lambda x, y: (x > y) - (x < y))
    result.urls = [uri.replace('kns', 'KCMS') for uri in result.urls]
    return result
def cal_IDF(all_seq_data):
    """
    Build the vocabulary of ``all_seq_data`` and one log-scaled weight per word.

    ``all_seq_data`` is iterated twice, so it must be re-iterable (e.g. a
    list of lists), not a one-shot generator.

    :param all_seq_data: iterable of word sequences
    :return: tuple ``(all_words, idf)`` where ``all_words`` is the word list
        returned by ``collection_utils.unique`` and ``idf[k]`` equals
        ``log(len(all_words) / (count[k] + 1))`` for the word ``all_words[k]``
    """
    cnki_logger.info("Start calculating IDF")
    all_words = []
    for item in all_seq_data:
        all_words.extend(item)
    cnki_logger.info("Before unique: [%d]" % len(all_words))

    # Portable three-way comparison: the ``cmp`` builtin no longer exists on
    # Python 3, and this expression yields the same -1 / 0 / 1 result.
    all_words = collection_utils.unique(
        all_words, lambda x, y: (x > y) - (x < y))
    size = len(all_words)
    dump_words(all_words)

    # presumably a list of ``size`` zeros, one counter per word - confirm
    count = collection_utils.list(size, 0)
    for row in all_seq_data:
        for word in row:
            index = collection_utils.binary_search(all_words, word)
            if index != -1:
                count[index] += 1
    cnki_logger.info("End calculating IDF. words number is [%s]" % size)

    # NOTE(review): the numerator is the vocabulary size and ``count`` is
    # incremented once per occurrence; textbook IDF uses the number of
    # documents and the per-document frequency - confirm this is intended.
    idf = [math.log(float(size) / (occurrences + 1)) for occurrences in count]
    return all_words, idf
# NOTE(review): reduce_repeat is defined twice in the visible source with the
# same logic; within a single module the later definition replaces this one.
def reduce_repeat():
    """
    De-duplicate the stored paper details and store the reduced list.

    All records from ``mongo_utils.get_all_paper_detail()`` are loaded into a
    list and passed to ``collection_utils.unique``, comparing records on the
    concatenation of their ``'name'`` and ``'title'`` values. The list length
    is then printed and the list is handed to
    ``mongo_utils.insert_reduce_paper_detail``.
    """
    papers = list(mongo_utils.get_all_paper_detail())

    def compare(x, y):
        # Portable replacement for the ``cmp`` builtin, which no longer
        # exists on Python 3; yields the same -1 / 0 / 1 result.
        left = x['name'] + x['title']
        right = y['name'] + y['title']
        return (left > right) - (left < right)

    # NOTE(review): the return value of unique() is ignored here, so this
    # relies on the helper de-duplicating the list in place - confirm.
    collection_utils.unique(papers, compare)
    print(len(papers))
    mongo_utils.insert_reduce_paper_detail(papers)
# NOTE(review): an earlier definition of reduce_repeat with the same logic is
# visible above; within a single module this later definition is the one used.
def reduce_repeat():
    """
    De-duplicate the stored paper details and store the reduced list.

    All records from ``mongo_utils.get_all_paper_detail()`` are loaded into a
    list and passed to ``collection_utils.unique``, comparing records on the
    concatenation of their ``'name'`` and ``'title'`` values. The list length
    is then printed and the list is handed to
    ``mongo_utils.insert_reduce_paper_detail``.
    """
    papers = list(mongo_utils.get_all_paper_detail())

    def compare(x, y):
        # Portable replacement for the ``cmp`` builtin, which no longer
        # exists on Python 3; yields the same -1 / 0 / 1 result.
        left = x['name'] + x['title']
        right = y['name'] + y['title']
        return (left > right) - (left < right)

    # NOTE(review): the return value of unique() is ignored here, so this
    # relies on the helper de-duplicating the list in place - confirm.
    collection_utils.unique(papers, compare)
    print(len(papers))
    mongo_utils.insert_reduce_paper_detail(papers)