Example #1
def search(Q_str,
           algorithm='bert',
           topN=10,
           is_qe=False,
           additional_attrs=['retweet_count', 'followers_count']):
    # Query expansion & Preprocess query
    Q_str = '电视剧电影'
    Q = query_expansion(Q_str, 'title', is_qe)
    print('Q', Q)

    # Find tweets that overlap with keywords of the original query
    post_ids = get_candidates(Q)

    # Return all candidates if there are no more of them than topN
    if len(post_ids) <= topN:
        return post_ids

    # Load candidate tweets from database
    tweets = load_tweets_from_db(post_ids)

    # Preprocess documents
    if algorithm == 'bert':
        # Extract precalculated embeddings
        D = extract_info(tweets, 'vec')
    else:
        # Extract and preprocess texts
        texts = extract_info(tweets, 'text')
        D = preprocess(texts)

    # Estimate the degree of similarity between query and documents
    scores = similarity_score(D, Q, algorithm)

    # Compute overall scores including popularity
    overall_scores = overall_score(scores, tweets,
                                   additional_attrs,
                                   [0.1, 0.1])

    # Sort
    topN_idxs = get_topN_idxs(overall_scores, topN)
    result_post_ids = [post_ids[idx] for idx in topN_idxs]
    return result_post_ids
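The helpers used above (extract_info, get_topN_idxs, etc.) are not shown in this example. A minimal sketch of the two that are easiest to infer from their call sites, field extraction from a list of record dicts and top-N index selection by score, could look like the following; the bodies are assumptions, only the names and call shapes come from the example.

# Hypothetical sketches; behaviour is inferred from how search() calls them.
def extract_info(records, field):
    # Pull a single field (e.g. 'vec' or 'text') out of each record dict
    return [record[field] for record in records]

def get_topN_idxs(scores, topN):
    # Indexes of the topN highest scores, best score first
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:topN]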
Example #2
def fetch_pl(pl_url):
    # Parse the playlist argument, then run the external info command
    pl = parse_arg_to_pl(pl_url)
    cmd = compose_pl_info_cmd(pl["url"])
    out = subprocess.run(cmd, capture_output=True, encoding="utf-8")

    infos = []
    # Each non-empty stdout line is expected to be one JSON object
    for jsonstr in filter(bool, out.stdout.split("\n")):
        j = json.loads(jsonstr)
        info = extract_info(j)
        info["downloaded"] = False

        if not pl.get("title"):
            pl["title"] = info["playlist_title"]

        del info["playlist_title"]
        infos.append(info)

    pl["eps"] = sorted(infos, key=lambda x: x["index"])

    return pl
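compose_pl_info_cmd is not defined in the snippet; the surrounding code only requires that the command print one JSON object per stdout line, one per playlist entry. A plausible sketch, assuming the command wraps youtube-dl (whose --dump-json flag emits one JSON object per video for a playlist URL), is shown below; treating it as youtube-dl is an assumption.

def compose_pl_info_cmd(url):
    # Assumed implementation: youtube-dl prints one JSON object per entry
    # when given a playlist URL together with --dump-json.
    return ["youtube-dl", "--dump-json", "--ignore-errors", url]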
Example #3
def lambda_handler(event, context):
    # Run the crawler and extract the confirmation details from the response
    crawler = Crawler()
    register_resp = crawler.run()
    register_html = utils.extract_info(register_resp.content)
    register_text = register_resp.text
    register_confirmation_code = utils.get_confirmation_code(
        register_resp.content)

    if "circle-success" in register_text:
        email_html = "<html><body style=\"background-color: black\">{}</body></html>".format(
            register_html)
        print("Successfully registered and sending email")
        send_confirmation_email(
            email_html, register_text,
            f"Register2Park Success ({register_confirmation_code})!")
    else:
        error_message = "Register2Park Confirmation Failed!"

        print(register_html)
        send_confirmation_email(None, error_message, error_message)
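send_confirmation_email is also not shown. Since the handler runs on AWS Lambda, a plausible sketch is an SES call; the sender and recipient addresses below are placeholders and the whole body is an assumption, not the project's actual helper.

import boto3

def send_confirmation_email(html_body, text_body, subject):
    # Hypothetical sketch: send via SES, attaching the HTML part only when present
    ses = boto3.client("ses")
    body = {"Text": {"Data": text_body}}
    if html_body:
        body["Html"] = {"Data": html_body}
    ses.send_email(
        Source="noreply@example.com",
        Destination={"ToAddresses": ["me@example.com"]},
        Message={"Subject": {"Data": subject}, "Body": body},
    )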
Example #4
def get_result(Q_str,
               algorithm='bert',
               topN=10,
               is_qe=True,
               additional_attrs=['retweet_count', 'followers_count']):
    # query expansion & preprocess query
    Q_str = '电视剧电影'
    Q = query_expansion(Q_str, 'title', is_qe)
    print('Q', Q)

    # find tweets that overlap with keywords of the original query
    post_ids = get_candidates(Q)
    # FIXME: LiXiangHe needs to implement load_tweets_from_db
    # tweets = [dict]
    tweets = load_tweets_from_db(post_ids)

    # extract text
    texts = extract_info(tweets, 'text')

    # preprocess texts
    D = preprocess(texts)

    # return all candidates if there are no more documents than topN
    if len(D) <= topN:
        return post_ids

    # estimate the degree of similarity between query and documents
    scores = similarity_score(D, Q, algorithm)

    # compute overall scores including popularity
    overall_scores = overall_score(scores, tweets,
                                   additional_attrs)

    # sort
    topN_idxs = get_topN_idxs(overall_scores, topN)
    results = [post_ids[idx] for idx in topN_idxs]
    return results
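overall_score is only visible through its call sites (example #1 also passes per-attribute weights). One plausible sketch, assuming each popularity attribute is min-max normalised over the candidate set and added to the similarity score with a small weight, is the following; the combination rule itself is an assumption.

def overall_score(scores, tweets, attrs, weights=None):
    # Hypothetical combination: similarity plus weighted, normalised popularity
    weights = weights or [0.1] * len(attrs)
    combined = list(scores)
    for attr, weight in zip(attrs, weights):
        values = [tweet.get(attr, 0) for tweet in tweets]
        lo, hi = min(values), max(values)
        span = (hi - lo) or 1
        for i, value in enumerate(values):
            combined[i] += weight * (value - lo) / span
    return combined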
Example #5
from utils import extract_info, analyse_for_each_repo

file_name = "sample.json"
actions = extract_info(file_name=file_name)

repos = actions["repos"]
information = actions["information"]

# Analyse each repository and print the resulting count
for repo in repos:
    cnt = analyse_for_each_repo(repo_name=repo, data=information)
    print(cnt)
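Here extract_info is called with a file name instead of a record list, and the result is used as a dict with top-level "repos" and "information" keys. Under that assumption the helper may be little more than a JSON loader; the sketch below is a guess at its behaviour, not the real utils implementation.

import json

def extract_info(file_name):
    # Assumed behaviour: load sample.json as-is; callers expect
    # top-level "repos" and "information" keys.
    with open(file_name) as f:
        return json.load(f)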
Example #6
"""
 We will have a dictionary of IP addresses containing this information,
 ordered by stop datetime
"""
open_sessions = OrderedDict()
"""
 Loop through file line by line
 Note: it would be more efficent to read by chunk of lines
 but it falls outside the scope of this project
"""
with open(logfile_path, 'r') as f, open(sessionfile_path, 'w') as fout:

    # figure out header-index mapping
    indexes = utils.extract_header(f, header_list)

    # loop through each line
    for line_str in f:

        # extract relevant information
        ip_address, current_datetime = utils.extract_info(line_str, indexes)

        # detect closed sessions and write them to output file
        ip2close = utils.close_sessions(open_sessions, current_datetime,
                                        timeout_delta, fout)

        # extend current session or create new one
        open_sessions = utils.update_sessions(open_sessions, current_datetime,
                                              ip_address)

    # no more lines to read, end all sessions
    utils.end_all_session(open_sessions, fout)
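The open_sessions dict is described as ordered by stop datetime, which is exactly what OrderedDict.move_to_end gives you if the most recently seen IP is pushed to the back on every hit. A sketch of update_sessions under that assumption follows; the (start, last_seen) tuple layout is made up for illustration.

from collections import OrderedDict

def update_sessions(open_sessions, current_datetime, ip_address):
    # Hypothetical sketch: track (start, last_seen) per IP and keep the dict
    # ordered by last-seen time by moving the updated IP to the end.
    if ip_address in open_sessions:
        start, _ = open_sessions[ip_address]
        open_sessions[ip_address] = (start, current_datetime)
        open_sessions.move_to_end(ip_address)
    else:
        open_sessions[ip_address] = (current_datetime, current_datetime)
    return open_sessions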
Example #7
    # fill each record with a placeholder 768-dimensional embedding
    for line in lines:
        line['vec'] = [0.1] * 768

    # query expansion & preprocess query
    orig_Q = '爱情电影'
    Q = query_expansion(orig_Q, 'title', False)
    print('new Q:', Q)

    # Filter irrelevant documents
    #   post_ids = get_candidates(orig_Q)
    post_ids = [line['post_id'] for line in lines]

    # Preprocess documents
    if algorithm == 'bert':
        # Extract precalculated embeddings
        D = extract_info(lines, 'vec')
    else:
        # Extract texts
        texts = extract_info(lines, 'text')
        D = preprocess(texts)

    # Return all documents if the number of candidate documents is
    # no more than topN
    topN = 10
    if len(D) <= topN:
        for d in D:
            print(d)
        results = post_ids
        sys.exit()

    # Estimate the degree of similarity between query and documents