def search(Q_str, algorithm='bert', topN=10, is_qe=False,
           additional_attrs=['retweet_count', 'followers_count']):
    # Query expansion & preprocess query
    Q_str = '电视剧电影'  # NOTE: hardcoded test query ("TV series / movie") overrides the argument
    Q = query_expansion(Q_str, 'title', is_qe)
    print('Q', Q)

    # Find tweets that overlap with keywords of the original query
    post_ids = get_candidates(Q)

    # Number of candidate documents is smaller than the output number
    if len(post_ids) <= topN:
        return post_ids

    # Load candidate tweets from database
    tweets = load_tweets_from_db(post_ids)

    # Preprocess documents
    if algorithm == 'bert':
        # Extract precalculated embeddings
        D = extract_info(tweets, 'vec')
    else:
        # Extract and preprocess texts
        texts = extract_info(tweets, 'text')
        D = preprocess(texts)

    # Estimate the degree of similarity between query and documents
    scores = similarity_score(D, Q, algorithm)

    # Compute overall scores including popularity
    overall_scores = overall_score(scores, tweets, additional_attrs, [0.1, 0.1])

    # Sort
    topN_idxs = get_topN_idxs(overall_scores, topN)
    result_post_ids = [post_ids[idx] for idx in topN_idxs]
    return result_post_ids
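A minimal usage sketch of search(), assuming the helpers it calls (query_expansion, get_candidates, load_tweets_from_db, similarity_score, overall_score, get_topN_idxs) are defined in the same module; the query string and topN value are illustrative:

# Illustrative call; returns the post ids of the topN tweets
# ranked by query similarity plus popularity.
top_ids = search('电视剧电影', algorithm='bert', topN=5)
print(top_ids)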
import json
import subprocess


def fetch_pl(pl_url):
    pl = parse_arg_to_pl(pl_url)
    cmd = compose_pl_info_cmd(pl["url"])
    out = subprocess.run(cmd, capture_output=True, encoding="utf-8")
    infos = []
    # The command prints one JSON object per line; skip empty lines
    for jsonstr in filter(bool, out.stdout.split("\n")):
        j = json.loads(jsonstr)
        info = extract_info(j)
        info["downloaded"] = False
        # Take the playlist title from the first entry that carries one
        if not pl.get("title"):
            pl["title"] = info["playlist_title"]
        del info["playlist_title"]
        infos.append(info)
    pl["eps"] = sorted(infos, key=lambda x: x["index"])
    return pl
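A hypothetical call to fetch_pl(), assuming compose_pl_info_cmd builds a command (e.g. a youtube-dl/yt-dlp-style invocation) that prints one JSON object per playlist entry; the URL below is a placeholder:

pl = fetch_pl("https://www.youtube.com/playlist?list=PLxxxxxxxx")  # placeholder URL
print(pl["title"], len(pl["eps"]))  # playlist title and number of episodes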
def lambda_handler(event, context):
    crawler = Crawler()
    register_resp = crawler.run()
    register_html = utils.extract_info(register_resp.content)
    register_text = register_resp.text
    register_confirmation_code = utils.get_confirmation_code(register_resp.content)
    # The success page marks confirmed registrations with a "circle-success" element
    if "circle-success" in register_text:
        email_html = "<html><body style=\"background-color: black\">{}</body></html>".format(
            register_html)
        print("Successfully registered and sending email")
        send_confirmation_email(
            email_html, register_text,
            f"Register2Park Success ({register_confirmation_code})!")
    else:
        error_message = "Register2Park Confirmation Failed!"
        print(register_html)
        send_confirmation_email(None, error_message, error_message)
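For local testing, a Lambda handler can be invoked directly with a dummy event and context; a minimal sketch (AWS supplies the real arguments at runtime):

if __name__ == "__main__":
    # Smoke test outside AWS; event and context are unused by this handler.
    lambda_handler({}, None)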
def get_result(Q_str, algorithm='bert', topN=10, is_qe=True,
               additional_attrs=['retweet_count', 'followers_count']):
    # Query expansion & preprocess query
    Q_str = '电视剧电影'  # NOTE: hardcoded test query ("TV series / movie") overrides the argument
    Q = query_expansion(Q_str, 'title', is_qe)
    print('Q', Q)

    # Find tweets that overlap with keywords of the original query
    post_ids = get_candidates(Q)

    # FIXME: LiXiangHe needs to implement load_tweets_from_db
    # tweets = [dict]
    tweets = load_tweets_from_db(post_ids)

    # Extract text
    texts = extract_info(tweets, 'text')

    # Preprocess texts
    D = preprocess(texts)

    # Number of candidate documents is smaller than the output number
    if len(D) <= topN:
        return post_ids

    # Estimate the degree of similarity between query and documents
    scores = similarity_score(D, Q, algorithm)

    # Compute overall scores including popularity
    overall_scores = overall_score(scores, tweets, additional_attrs)

    # Sort
    topN_idxs = get_topN_idxs(overall_scores, topN)
    results = [post_ids[idx] for idx in topN_idxs]
    return results
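The FIXME above leaves load_tweets_from_db unimplemented. A minimal sketch of one possible implementation, assuming a SQLite database with a tweets table holding post_id, text, retweet_count, and followers_count columns (the database path, table, and column names are all assumptions):

import sqlite3

def load_tweets_from_db(post_ids, db_path='tweets.db'):
    # Hypothetical sketch: return one dict per candidate post id,
    # preserving the order of post_ids. Schema is assumed, not confirmed.
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    placeholders = ','.join('?' * len(post_ids))
    rows = conn.execute(
        'SELECT post_id, text, retweet_count, followers_count '
        'FROM tweets WHERE post_id IN ({})'.format(placeholders),
        list(post_ids)).fetchall()
    conn.close()
    by_id = {row['post_id']: dict(row) for row in rows}
    return [by_id[pid] for pid in post_ids]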
from utils import extract_info, analyse_for_each_repo

file_name = "sample.json"
actions = extract_info(file_name=file_name)
repos = actions["repos"]
information = actions["information"]

for repo in repos:
    cnt = analyse_for_each_repo(repo_name=repo, data=information)
    print(cnt)
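Based on the keys read above, extract_info presumably parses sample.json into a dict with repos and information entries; a hypothetical shape (only the two top-level keys are grounded in the code, the inner structure is assumed):

actions = {
    "repos": ["org/repo-a", "org/repo-b"],  # assumed: repository names
    "information": [                        # assumed: per-event records
        {"repo_name": "org/repo-a", "action": "push"},
        {"repo_name": "org/repo-b", "action": "pull_request"},
    ],
}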
We will have a dictionary of IP addresses containing this information,
ordered by stop datetime
"""
open_sessions = OrderedDict()

"""
Loop through the file line by line
Note: it would be more efficient to read by chunks of lines,
but that falls outside the scope of this project
"""
with open(logfile_path, 'r') as f, open(sessionfile_path, 'w') as fout:
    # figure out header-index mapping
    indexes = utils.extract_header(f, header_list)
    # loop through each line
    for line_str in f:
        # extract relevant information
        ip_address, current_datetime = utils.extract_info(line_str, indexes)
        # detect closed sessions and write them to the output file
        ip2close = utils.close_sessions(open_sessions, current_datetime,
                                        timeout_delta, fout)
        # extend the current session or create a new one
        open_sessions = utils.update_sessions(open_sessions, current_datetime,
                                              ip_address)
    # no more lines to read, end all sessions
    utils.end_all_session(open_sessions, fout)
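A minimal sketch of what utils.extract_info could look like here, assuming comma-separated log lines and an indexes mapping from header names to column positions produced by utils.extract_header (the 'ip', 'date', and 'time' header names and the datetime format are assumptions):

from datetime import datetime

def extract_info(line_str, indexes):
    # Hypothetical sketch: split one CSV log line and pull out the two
    # fields the sessionization loop needs. Column names are assumed.
    fields = line_str.rstrip('\n').split(',')
    ip_address = fields[indexes['ip']]
    current_datetime = datetime.strptime(
        fields[indexes['date']] + ' ' + fields[indexes['time']],
        '%Y-%m-%d %H:%M:%S')
    return ip_address, current_datetime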
import sys

for line in lines:
    line['vec'] = [0.1] * 768  # stub embedding so the 'bert' branch has vectors

# Query expansion & preprocess query
orig_Q = '爱情电影'  # test query ("romance movie")
Q = query_expansion(orig_Q, 'title', False)
print('new Q:', Q)

# Filter irrelevant documents
# post_ids = get_candidates(orig_Q)
post_ids = [line['post_id'] for line in lines]

# Preprocess documents
if algorithm == 'bert':
    # Extract precalculated embeddings
    D = extract_info(lines, 'vec')
else:
    # Extract texts
    texts = extract_info(lines, 'text')
    D = preprocess(texts)

# Return all documents if the number of candidate documents is
# smaller than the output number
topN = 10
if len(D) <= topN:
    for d in D:
        print(d)
    results = post_ids
    sys.exit()

# Estimate the degree of similarity between query and documents
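The snippet breaks off before similarity_score runs; a minimal sketch of a cosine-similarity scorer for the 'bert' branch, assuming D holds one 768-dimensional embedding per document and the query has already been embedded into a single vector (that embedding step is an assumption):

import numpy as np

def similarity_score(D, q_vec, algorithm='bert'):
    # Hypothetical sketch for the 'bert' branch only: cosine similarity
    # between the query embedding and each precalculated document embedding.
    docs = np.asarray(D, dtype=float)   # shape: (n_docs, 768)
    q = np.asarray(q_vec, dtype=float)  # shape: (768,)
    norms = np.linalg.norm(docs, axis=1) * np.linalg.norm(q)
    return docs @ q / np.where(norms == 0, 1.0, norms)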