def predict(self): f = open("model/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.batch_size = 1 self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(FLAGS.spm) self.train_length = 10 self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() dataset = [] with open('data/test_raw_big.txt', 'r', encoding="utf-8") as f: for data in f.readlines(): ent, raw_con, info = data.replace('\n', '').split('\t') dataset.append([ent, raw_con, info]) f.close() for ele in dataset: info = ele[2] text = info feed = self.prepare_xlnet_pred_data(text) paths, length = sess.run([self.pred_ids, self.length], feed_dict=feed) print(format_tags(paths[0], self.tag_map)) org = get_tags(paths[0], "", self.tag_map) org_entity = format_result(org, text, "") per = get_tags(paths[0], "", self.tag_map) per_entity = format_result(per, text, "") loc = get_tags(paths[0], "", self.tag_map) loc_entity = format_result(loc, text, "") resp = org_entity["entities"] + per_entity[ "entities"] + loc_entity["entities"] ele.append(str(resp)) with open('data/test_result.txt', 'w', encoding="utf-8") as f1: for ele in dataset: f1.write(ele[2]) f1.write('\t') f1.write(ele[1]) f1.write('\t') f1.write(ele[3]) f1.write('\n') f1.close() '''
def predict(self): f = open("data/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.vocab = maps.get("vocab", {}) self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.input_size = maps.get("input_size", 10000) + 1 self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() while True: text = input(" > ") feed = self.prepare_pred_data(text) logits, length = sess.run([self.logits, self.length], feed_dict=feed) paths = self.decode(logits, length, trans) org = get_tags(paths[0], "ORG", self.tag_map) org_entity = format_result(org, text, "ORG") per = get_tags(paths[0], "PER", self.tag_map) per_entity = format_result(per, text, "PER") resp = org_entity["entities"] + per_entity["entities"] print(json.dumps(resp, indent=2, ensure_ascii=False))
def image_search(request):
    print(request)
    if request.method == 'GET':
        utils.get_tags(request.GET.get('image_url'))
        classification = utils.run_nmf()
        response = {"id": classification}
        return HttpResponse(json.dumps(response), content_type="application/json")
    return None
def predict(self, input_str="", input_path=None): if input_path is not None: tests = pd.read_csv(input_path) with open('output.txt', 'w', encoding='utf-8') as o: #o.write('id,aspect,opinion\n') for ids in range(1, 2235): input_str = self.get_string( str(tests.loc[ids - 1:ids - 1, ['Review']])) index = int( self.get_string(str(tests.loc[ids - 1:ids - 1, ['id']]))) input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor if (self.use_gpu): # gpu加速 sentences = torch.tensor(input_vec).view(1, -1).cuda() else: sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) entities = sorted(entities, key=lambda x: x['start']) #print(str(index) + " " + input_str + " " +str(len(entities))) for entity in entities: #print(entity) o.write( str(index) + ',' + entity['type'] + ',' + entity['word'] + '\n') else: if not input_str: input_str = input("请输入文本: ") input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor if (self.use_gpu): # gpu加速 sentences = torch.tensor(input_vec).view(1, -1).cuda() else: sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) return entities
def test(self):
    with torch.no_grad():
        id2vocab = {self.vocab[i]: i for i in self.vocab}
        print(len(id2vocab))
        f = open('./result/test_tag.json', 'w')
        # rows are the tag types (component, disease&symptom, people);
        # columns are recall, precision, f1
        total_matrix = np.zeros([len(self.tags), 3])
        count = 0
        for batch in self.dev_manager.get_batch():
            count += 1
            print(count)
            sentences, labels, length = zip(*batch)
            # sentences, labels, length = zip(*self.dev_batch.__next__())
            strs = [[id2vocab[w] for w in s] for s in sentences]
            _, paths = self.model(sentences)
            for i in range(len(self.tags)):
                recall, precision, f1 = f1_score(labels, paths, self.tags[i], self.model.tag_map)
                total_matrix[i][0] += recall
                total_matrix[i][1] += precision
                total_matrix[i][2] += f1
            entities = []
            for i in range(len(paths)):
                tmp = []
                for tag in self.tags:
                    tags = get_tags(paths[i], tag, self.tag_map)
                    tmp += format_result(tags, strs[i], tag)
                entities.append(tmp)
            for i in range(len(entities)):
                dic = {'sentense': ''.join(strs[i]), 'entities': entities[i]}
                json.dump(dic, f, ensure_ascii=False)
                # f.write(''.join(strs[i]) + '#####entities found#####' + '&'.join(entities[i]) + '\n')
        total_matrix /= count
        for i in range(len(self.tags)):
            print("{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}".format(
                count, self.tags[i], total_matrix[i][0], total_matrix[i][1], total_matrix[i][2]))
        f.close()
def predict(self): f = open("model/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.batch_size = 1 self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(FLAGS.spm) self.train_length = 10 self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() while True: text = input(" > ") feed = self.prepare_xlnet_pred_data(text) paths, length = sess.run([self.pred_ids, self.length], feed_dict=feed) print(format_tags(paths[0], self.tag_map)) org = get_tags(paths[0], "ORG", self.tag_map) org_entity = format_result(org, text, "ORG") per = get_tags(paths[0], "PER", self.tag_map) per_entity = format_result(per, text, "PER") loc = get_tags(paths[0], "LOC", self.tag_map) loc_entity = format_result(loc, text, "LOC") resp = org_entity["entities"] + per_entity[ "entities"] + loc_entity["entities"] print(json.dumps(resp, indent=2, ensure_ascii=False))
def predict(self, input_str=""): if not input_str: input_str = input("请输入文本: ") input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) return entities
def predict(self, tag, input_str=""): model.load_state_dict(torch.load("./model/params.pkl")) if not input_str: input_str = input("请输入文本: ") input_vec = [word2id.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) paths = model(sentences) entities = [] tags = get_tags(paths[0], tag, tag2id) entities += format_result(tags, input_str, tag) print(entities)
def run_model(model_name, vocab, tf):
    # (vocab, tf) = utils.read_corpus()
    N = len(tf)  # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)
    top_words = []
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words += gen_tags
    return top_words
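# A minimal usage sketch, assuming utils.read_corpus() returns the (vocab, tf)
# pair hinted at by the commented-out line above; the model name must be one of
# the two branches handled in run_model().
vocab, tf = utils.read_corpus()
doc_tags = run_model("tfidf", vocab, tf)
print(doc_tags[:10])  # first few generated tags across the corpus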
def run_model(model_name, vocab, tf):
    speech_info = utils.read_speech_info()
    N = len(speech_info)  # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)
    top_words = {}
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words[speech_info[i]] = gen_tags
    return top_words
def get_lastest_tag(repo_name, today):
    logger.debug('Init config')
    config = init_config(repo_name)
    environment = config['environments'][env_name]
    env_key = environment['env_key'][repo_name]
    tag_re_patten = config['tag_re_patten']
    logger.debug('Searched tag pattern is {}'.format(tag_re_patten))
    git_path = git_folder_path + repo_name
    logger.debug('Git path is {}'.format(git_path))
    logger.debug('Init repo')
    repo = git.Repo.init(path=git_path)
    repo.git.fetch()
    logger.debug('Fetch remote tags')
    all_tag = repo.git.ls_remote('--tags')
    tags = get_tags(all_tag)
    logger.debug('Start to find latest tag')
    tag_re_patten = tag_re_patten.format(env_key, today)
    tag_name = find_latest_tag(tag_re_patten, tags)
    if tag_name is None:
        logger.warning(
            'Cannot find a matching tag for {}. Maybe there is no tag today, or check the tag pattern.'
            .format(repo_name))
        return None
    logger.info('The latest tag of {} is {}'.format(repo_name, tag_name))
    return tag_name
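# A minimal usage sketch; the repository name is illustrative, and `today` uses
# the YYYYMMDD format seen elsewhere in this module.
today = datetime.date.today().strftime('%Y%m%d')
latest = get_lastest_tag('my-service', today)
if latest is None:
    print('no matching tag found for today')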
def predict(self, input_str=""): if not input_str: input_str = input("请输入文本: ") # 获取输入句子所有汉字的在vocab的索引 input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1) sentences = sentences.cuda() # paths 预测出来的标签索引 shape 为 [1,1] _, paths = self.model(sentences) entities = [] # "tags": ["ORG", "PER"] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) print(entities) print(json.dumps(entities, indent=4, ensure_ascii=False)) return entities
def predict(self, path): #, input_str=""): # if not input_str: # input_str = input("请输入文本: ") sentences = [] with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f: for i in f: sentences += i.strip().split('。') f = open('./result/tag_' + path + '.json', 'w') for input_str in sentences: input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) dic = {'sentense': input_str, 'entities': entities} json.dump(dic, f, ensure_ascii=False) f.close()
def retrieve_products_for_interest(interest):
    list_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest, QUERY_STR)
    html = retrieve_data("uncommon-goods.{}.html".format(interest), list_url)
    soup = bs(html, "html.parser")
    prod_links = [link["href"] for link in soup.select("article.product a")]
    for link in prod_links[:100]:
        prod_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("uncommon-goods", link_to_fname(link))
        print("Fetching {}...".format(prod_link))
        html = retrieve_data(fname, prod_link)
        soup = bs(html, "html.parser")
        try:
            title = soup.find("h1", {"itemprop": "name"}).get_text()
            title = clean_whitespace(title)
            description = soup.select_one(".theStoryCopy p").get_text()
            description = clean_whitespace(description)
            image = soup.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            price = soup.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price))
            tags = get_tags(description)
            product = Product(title, "{}{}".format(BASE_URL, link), image,
                              interest, tags, description, price=price)
            product.dump()
        except Exception as e:
            print("ERROR:", e)
        print("")
# -*- coding:utf-8 -*-
from .api_base import JsonHandler
from utils import get_tags, get_tags_v2, get_tags_parents, get_tags_v3  # get_tags_v2_by_name
from db import Tag, Share, User
import tornado
import time
import copy

d_tags = get_tags()
d_tags_v2 = get_tags_v2()
d_tags_v3 = get_tags_v3()
d_tags_parents = get_tags_parents()  # get_tags_v2_by_name


class TagsV2Handler(JsonHandler):
    def get(self):
        ver = self.get_argument("ver", 3)
        name = self.get_argument("name", '')
        sid = self.get_argument("id", 0)
        ver = int(ver)
        sid = int(sid)
        # parents [0]
        if name or sid:  # a specific tag
            if not name and sid:
                tag = Tag.by_sid(sid)
                name = tag['name']
            self.res = d_tags_v3.get(name, {})
def get(self): page = self.get_argument("page", 1) per_page = self.get_argument("per_page", 10) meta_info = self.get_argument("meta_info", None) my_tags = self.get_argument("my_tags", None) tag = self.get_argument('tag', '') per_page = int(per_page) page = int(page) user = None tags = None token = self.request.headers.get('Authorization', '') if token: key, token = token.split() if key == 'token' and token: user_json = self.get_secure_cookie('user', token) if user_json: user = json_decode(user_json) else: user_json = self.get_secure_cookie("user") if user_json: user = json_decode(user_json) print(user) print(my_tags) if user and my_tags: d_user = User.by_sid(user['user_id']) print(d_user, 1111) if d_user: print(d_user['user_tags']) tags = d_user['user_tags'] vote_open = self.get_argument("vote_open", None) has_vote = self.get_argument("has_vote", None) cond = {} if tags: print('1111111111111', tags) cond['tags'] = {"$in": tags} elif tag: cond['tags'] = tag if user: logger.info('user_id: {}'.format(user['user_id'])) if user and user['user_id'] in wx_admin_ids: cond['status'] = {'$gte': 1} else: cond['status'] = {'$gte': 1} if vote_open: if not vote_open.isdigit(): return self.write_error(422) cond['vote_open'] = int(vote_open) if has_vote: cond['vote_title'] = {'$ne': ''} number = Share.find(cond, {'_id': 0}).count() shares = Share.find(cond, {'_id': 0}).sort( '_id', -1).limit(per_page).skip((page - 1) * per_page) shares = [fix_share(share) for share in shares] # if tag: # shares = [share for share in shares if tag in share['tags']] meta = {} if meta_info and tag: d_tags = get_tags() # d_tags_parent = get_tags_parent() d_tags_parents = get_tags_parents() if tag in d_tags: sub_tags = [] print(d_tags[tag]) for name in d_tags[tag]: num = Share.find({'tags': name}, {'_id': 0}).count() num_recent = Share.find( {'tags': name, 'published': {'$gt': time.time()-86400*30}}, {'_id': 0}).count() info = {} info['name'] = name info['num'] = num info['num_recent'] = num_recent sub_tags.append(info) meta['sub_tags'] = sub_tags meta['parent_tags'] = [] if tag in d_tags_parents: # hypernym # meta['parent_tags'].append(d_tags_parent[tag]) meta['parent_tags'] = d_tags_parents[tag] self.res = list(shares) self.meta = meta print(meta) # number=len(self.res) return self.write_json(number=number)
def main(repo_name):
    logger.debug('init config')
    config = init_config(repo_name)
    environment = config['environments'][env_name]
    env_key = environment['env_key'][repo_name]
    today = datetime.date.today().strftime('%Y%m%d')
    logger.debug('get today is ' + today)
    git_path = git_folder_path + repo_name
    logger.info('git path is ' + git_path)
    logger.debug('init repo')
    repo = git.Repo.init(path=git_path)
    # Prune is needed when a branch is deleted and then re-created with the same name.
    repo.git.fetch('--prune')
    logger.debug('get remote tags')
    all_tag = repo.git.ls_remote('--tags')
    tags = get_tags(all_tag)
    logger.debug('find latest tag')
    tag_re_patten = config['tag_re_patten']
    tag_re_patten = tag_re_patten.format(env_key, today)
    tag_name = find_latest_tag(tag_re_patten, tags)
    if tag_name is None:
        raise TagNotFoundException()
    logger.info('latest tag is ' + tag_name)
    logger.debug('find latest branch')
    branches = repo.git.branch('-r').split('\n')
    merged_branch_name_re_patten = config['merged_branch_name_re_patten'].format(env_key, today)
    latest_branch = find_latest_branch(merged_branch_name_re_patten, branches)
    branch_index = get_branch_index(latest_branch)
    new_branch_name = config['new_branch_name_patten'].format(env_key, today, branch_index)
    logger.info('new branch is ' + new_branch_name)
    logger.debug('check whether branch exists')
    is_exists = some(branches, lambda b: new_branch_name in b)
    if is_exists:
        raise BranchIsExistException(new_branch_name)
    else:
        logger.debug('branch does not exist')
    logger.debug('create branch')
    repo.git.checkout(tag_name, '-b', new_branch_name)
    logger.debug('push branch')
    repo.git.push('origin', new_branch_name)
    logger.debug('checkout to dev')
    repo.git.checkout(source_branch)
    logger.debug('get branch diff commit')
    all_log = repo.git.log(
        'origin/{}..origin/{}'.format(config['uat_branch'], new_branch_name),
        '--oneline', '--no-merges')
    logger.debug('build pull request desc')
    pr_desc = build_pr_desc(all_log)
    logger.debug('create request service')
    request_service = RequestService(config['host'], config['headers'], auth)
    logger.debug('get reviewers')
    uat_branch = config['uat_branch']
    default_reviewers_api = config['default_reviewers_api'].format(repo_name)
    reviewers = get_reviewers(request_service, default_reviewers_api, uat_branch, new_branch_name)
    logger.debug('build pull request obj')
    pr_obj = build_pr_obj(new_branch_name, uat_branch, pr_desc, reviewers)
    logger.debug('post to create pull request')
    pull_requests_api = config['pull_requests_api'].format(repo_name)
    rs = post_pr(request_service, pull_requests_api, pr_obj)
    if rs.status_code != 201:
        logger.error('{} {} create pull request failed.'.format(repo_name, new_branch_name))
        status_code = rs.status_code
        result = json.loads(rs.text, encoding='utf-8')
        message = result['errors'][0]['message']
        raise BitbucketException(status_code, message, new_branch_name)
    logger.info('create pull request success.')
    logger.info('finish')
def get(self):
    token = self.request.headers.get('Authorization', '')
    page = self.get_argument("page", 1)
    per_page = self.get_argument("per_page", 10)
    tag = self.get_argument('tag', '')
    filter_type = self.get_argument("filter_type", '')  # my_tags my_likes
    last_suggested = self.get_argument("last_suggested", 0)
    read_status = self.get_argument('read_status', 1)
    meta_info = self.get_argument("meta_info", 1)
    read_status = int(read_status)
    per_page = int(per_page)
    page = int(page)
    if not last_suggested:
        last_suggested = 0
    last_suggested = float(last_suggested) / 1000 + 1
    user = self.get_user_dict(token)
    cond = {}
    tags = None
    if user and filter_type == 'my_tags':
        d_user = User.by_sid(user['user_id'])
        if d_user:
            tags = d_user['user_tags']
    # filter by tag
    if tags:
        cond['tags'] = {"$in": tags}
    elif tag:
        cond['tags'] = tag
    # different users see different recommendation levels
    # if user and user['user_id'] in wx_admin_ids:
    if user and user['user_id'] == 1:
        cond['status'] = {'$gte': 1}
    else:
        cond['status'] = {'$gte': 1}
    # list of already-read shares (about 20 ms)
    l_hitted_share_id = []
    if user and read_status:
        hits = Hit.find({'user_id': user['user_id']}, {'_id': 0, 'share_id': 1})
        l_hitted_share_id = [i['share_id'] for i in hits]
    filter_d = {}
    filter_d['_id'] = 0
    # only whitelisted fields are returned
    filter_d['id'] = 1
    filter_d['images'] = 1
    filter_d['title'] = 1
    filter_d['user_id'] = 1
    filter_d['tags'] = 1
    filter_d['published'] = 1
    filter_d['post_img'] = 1
    shares = Share.find(cond, filter_d).sort('suggested', -1).limit(per_page).skip(
        (page - 1) * per_page)
    # filter
    new_shares = []
    for share in shares:
        user = User.by_sid(share.user_id)
        # share = dict(share)
        share['type'] = 1
        # if share.post_img:
        # if hasattr(share, 'post_img'):
        if share.get('post_img'):
            share['type'] = 2
            share['images'] = [
                IMG_BASE + share['post_img'].replace('_1200.jpg', '_260.jpg')
            ]
            share.pop('post_img')
        else:
            share['images'] = []
        share['author'] = user.user_name
        share['published'] = int(share['published'] * 1000)  # share.published
        if read_status:
            share['read'] = bool(share['id'] in l_hitted_share_id)
        if 0:  # do not show the author avatar
            if user.user_email.endswith('@wechat'):
                share['user_img'] = options.site_url + get_avatar_by_wechat(user._id)
            if user.user_email.endswith('@anwensf.com'):
                share['user_img'] = options.site_url + get_avatar_by_feed(user.id)
            else:
                share['user_img'] = options.site_url + get_avatar(user.user_email, 100)
        new_shares.append(share)
    if meta_info:
        meta = {}
        if last_suggested:
            cond_update = copy.deepcopy(cond)
            cond_update['suggested'] = {'$gt': last_suggested}
            number_of_update = Share.find(cond_update, {'_id': 0, 'id': 1}).count()
            meta['number_of_update'] = number_of_update
        if tag:
            # article counts for sub-tags
            d_tags = get_tags()
            d_tags_parents = get_tags_parents()  # get_tags_parent
            if tag in d_tags:
                sub_tags = []
                for name in d_tags[tag]:
                    info = {}
                    info['name'] = name
                    # num = Share.find({'tags': name}, {'_id': 0}).count()
                    # num_recent = Share.find(
                    #     {'tags': name, 'published': {'$gt': time.time() - 86400 * 30}},
                    #     {'_id': 0}).count()
                    # info['num'] = num
                    # info['num_recent'] = num_recent
                    sub_tags.append(info)
                meta['sub_tags'] = sub_tags
            meta['parent_tags'] = []
            if tag in d_tags_parents:  # hypernym
                # meta['parent_tags'].append(d_tags_parent[tag])
                meta['parent_tags'] = d_tags_parents[tag]
        number = Share.find(cond, {'_id': 0}).count()  # 'id': 1
        meta['number'] = number
        # if filter_type == 'my_tags':
        #     meta['tags'] = tags
    self.res = {'articles': new_shares}
    self.meta = meta
    return self.write_json()
def get(self): page = self.get_argument("page", 1) per_page = self.get_argument("per_page", 10) filter_type = self.get_argument("filter_type", '') # my_tags tag = self.get_argument('tag', '') meta_info = self.get_argument("meta_info", 1) last_suggested = self.get_argument("last_suggested", 0) read_status = self.get_argument('read_status', 1) token = self.request.headers.get('Authorization', '') # has_vote = self.get_argument("has_vote", None) # vote_open = self.get_argument("vote_open", None) read_status = int(read_status) per_page = int(per_page) page = int(page) last_suggested = float(last_suggested) / 1000 + 1 user = self.get_user_dict(token) cond = {} # 按照tags来过滤 tags = None if user and filter_type == 'my_tags': d_user = User.by_sid(user['user_id']) if d_user: tags = d_user['user_tags'] # 按照tag来过滤 if tags: cond['tags'] = {"$in": tags} elif tag: cond['tags'] = tag # 不同的用户显示不同级别的推荐 if user and user['user_id'] in wx_admin_ids: cond['status'] = {'$gte': 1} else: cond['status'] = {'$gte': 1} l_hitted_share_id = [] if user and read_status: hits = Hit.find({'user_id': user['user_id']}) l_hitted_share_id = [i['share_id'] for i in hits] # if vote_open: # if not vote_open.isdigit(): # return self.write_error(422) # cond['vote_open'] = int(vote_open) # if has_vote: # cond['vote_title'] = {'$ne': ''} number = Share.find(cond, {'_id': 0}).count() # sort: _id if last_suggested: cond_update = copy.deepcopy(cond) cond_update['suggested'] = {'$gt': last_suggested} number_of_update = Share.find(cond_update, {'_id': 0}).sort( 'suggested', -1).count() logger.info('number_of_update 1: {}'.format(number_of_update)) num_shares = Share.find(cond, {'_id': 0, 'id': 1}).count() shares = Share.find(cond, {'_id': 0}).sort( 'suggested', -1).limit(per_page).skip((page - 1) * per_page) # shares = [fix_share(share) for share in shares] new_shares = [] for share in shares: share = fix_share(share) user = User.by_sid(share.user_id) share = dict(share) share['user_name'] = user.user_name share['markdown'] = '' if read_status: share['read'] = bool(share['id'] in l_hitted_share_id) soup = BeautifulSoup(share['content'], "lxml") # kill all script and style elements for script in soup(["script", "style"]): script.extract() # rip it out # get text text = soup.get_text() # break into lines and remove leading and trailing space on each lines = (line.strip() for line in text.splitlines()) # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # drop blank lines text = '\n'.join(chunk for chunk in chunks if chunk) # print(text) share['summary'] = text[:150] share['content'] = '' if user.user_email.endswith('@wechat'): share['user_img'] = options.site_url + \ get_avatar_by_wechat(user._id) if user.user_email.endswith('@anwensf.com'): share['user_img'] = options.site_url + \ get_avatar_by_feed(user.id) else: share['user_img'] = options.site_url + \ get_avatar(user.user_email, 100) new_shares.append(share) # if tag: # shares = [share for share in shares if tag in share['tags']] meta = {} meta['page'] = page meta['articleNumber'] = num_shares if meta_info and last_suggested: meta['number_of_update'] = number_of_update if meta_info and tag: d_tags = get_tags() # d_tags_parent = get_tags_parent() d_tags_parents = get_tags_parents() if tag in d_tags: sub_tags = [] for name in d_tags[tag]: num = Share.find({'tags': name}, {'_id': 0}).count() num_recent = Share.find( {'tags': name, 'published': {'$gt': time.time() - 86400 * 30}}, {'_id': 0}).count() info = {} info['name'] = name 
info['num'] = num info['num_recent'] = num_recent sub_tags.append(info) meta['sub_tags'] = sub_tags meta['parent_tags'] = [] if tag in d_tags_parents: # hypernym # meta['parent_tags'].append(d_tags_parent[tag]) meta['parent_tags'] = d_tags_parents[tag] logger.info('last_suggested time: {}'.format(last_suggested)) if new_shares: logger.info('new_shares[0] time: {}'.format(new_shares[0]['title'])) logger.info('new_shares[0] published time: {}'.format( new_shares[0]['published'])) logger.info('new_shares[0] suggested time: {}'.format( new_shares[0]['suggested'])) self.res = {'articles': list(new_shares)} self.meta = meta # number=len(self.res) # number=number return self.write_json()
def main():
    stc.html(HTML_BANNER)
    menu = ["Home", "MultiVerse", "About"]
    df = load_bible("data/KJV_Bible.csv")
    choice = st.sidebar.selectbox("Menu", menu)
    if choice == "Home":
        st.subheader("Single Verse Search")
        book_list = df["book"].unique().tolist()
        book_name = st.sidebar.selectbox("Book", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        verse = st.sidebar.number_input("Verse", 1)
        bible_df = df[df["book"] == book_name]
        # Layout
        c1, c2 = st.beta_columns([2, 1])
        # Single Verse Layout
        with c1:
            try:
                selected_passage = bible_df[(bible_df["chapter"] == chapter)
                                            & (bible_df["verse"] == verse)]
                passage_details = "{} Chapter::{} Verse::{}".format(book_name, chapter, verse)
                st.info(passage_details)
                passage = "{}".format(selected_passage["text"].values[0])
                st.write(passage)
            except:
                st.warning("Book out of Range")
        with c2:
            # st.success("Verse of the Day")
            chapter_list = range(10)
            verse_list = range(20)
            ch_choice = random.choice(chapter_list)
            vs_choice = random.choice(verse_list)
            random_book_name = random.choice(book_list)
            # st.write("Book:{},Ch:{},Vs:{}".format(random_book_name, ch_choice, vs_choice))
            rand_bible_df = df[df["book"] == random_book_name]
            try:
                randomly_selected_passage = rand_bible_df[
                    (rand_bible_df["chapter"] == ch_choice)
                    & (rand_bible_df["verse"] == vs_choice)]
                mytext = randomly_selected_passage["text"].values[0]
            except:
                mytext = rand_bible_df[(rand_bible_df["chapter"] == 1)
                                       & (rand_bible_df["verse"] == 1)]["text"].values[0]
            stc.html(HTML_RANDOM_TEMPLATE.format(mytext), height=300)
        # Search Topic/Term
        search_term = st.text_input("Term/Topic")
        with st.beta_expander("View Results"):
            retrieved_df = df[df["text"].str.contains(search_term)]
            st.dataframe(retrieved_df[["book", "chapter", "verse", "text"]])
    elif choice == "MultiVerse":
        st.subheader("MultiVerse Retrieval")
        book_list = df["book"].unique().tolist()
        book_name = st.sidebar.selectbox("Book", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        bible_df = df[df["book"] == book_name]
        all_verse = bible_df["verse"].unique().tolist()
        verse = st.sidebar.multiselect("Verse", all_verse, default=1)
        selected_passage = bible_df.iloc[verse]
        st.dataframe(selected_passage)
        passage_details = "{} Chapter::{} Verse::{}".format(book_name, chapter, verse)
        st.info(passage_details)
        # Layout
        col1, col2 = st.beta_columns(2)
        # Join all text as a sentence
        docx = " ".join(selected_passage["text"].tolist())
        with col1:
            st.info("Details")
            for i, row in selected_passage.iterrows():
                st.write(row["text"])
        with col2:
            st.success("StudyMode")
            with st.beta_expander("Visualize Entities"):
                # st.write(docx)
                render_entities(docx)
            with st.beta_expander("Visualize Pos Tags"):
                tagged_docx = get_tags(docx)
                processed_tags = mytag_visualizer(tagged_docx)
                # st.write(processed_tags)  # Raw
                stc.html(processed_tags, height=1000, scrolling=True)
            with st.beta_expander("Keywords"):
                processed_docx = nfx.remove_stopwords(docx)
                keywords_tokens = get_most_common_tokens(processed_docx, 5)
                st.write(keywords_tokens)
            with st.beta_expander("Pos Tags Plot"):
                tagged_docx = get_tags(docx)
                tagged_df = pd.DataFrame(tagged_docx, columns=["Tokens", "Tags"])
                # st.dataframe(tagged_df)
                df_tag_count = tagged_df["Tags"].value_counts().to_frame("counts")
                df_tag_count["tag_type"] = df_tag_count.index
                # st.dataframe(df_tag_count)
                c = alt.Chart(df_tag_count).mark_bar().encode(x="tag_type", y="counts")
                st.altair_chart(c, use_container_width=True)
            with st.beta_expander("Verse Curve"):
                plot_mendelhall_curve(docx)
            with st.beta_expander("Word Freq Plot"):
                plot_word_freq_with_altair(docx)
    else:
        st.subheader("About")
        st.text("Built with Streamlit")
def get(self):
    d_tags = get_tags()
    self.res = d_tags
    self.write_json()