Example #1
def clean_tweets(input, output):
    # Creates the output text file
    output_file = open(output,'w')

    # Creates a JSON array out of the line-delimited JSON file
    json_input = utility.to_json_array(input)

    # Initialize unicode count variable
    unicode_count = 0

    # Iterate through all JSON Objects within the array
    for currentObj in json_input:
        if 'text' in currentObj:
            text = parse.parse_json(currentObj, 'text')
            created_at = parse.parse_json(currentObj, 'created_at')

            # UNICODE CHECK
            if contain_unicode(text) or contain_unicode(created_at):
                unicode_count += 1
                text = utility.clean_text(text)
                created_at = utility.clean_text(created_at)

            # Write JSON Object Post Output
            #print(text + ' (timestamp: ' + created_at + ')\n')
            output_file.write(text + ' (timestamp: ' + created_at + ')\n')

    # Write the aggregate unicode count to the output file
    output_file.write(str(unicode_count) + ' tweet(s) contained unicode.')
    #print(str(unicode_count) + ' tweet(s) contained unicode.')

    # Close files
    output_file.close()
    print("\nTweet clean completed\nOutput in: " + output)
Example #2
def main(base_url, page):
    """主函数"""
    # 添加_token参数
    GET_PARAM["_token"] = encrypt_token()
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            print(data, sep='\n')
            save(data)
    except Exception:
        # No usable data in the response; dump it for inspection
        print(response)
Example #3
def main(base_url, page):
    """主函数"""
    # 添加_token参数
    GET_PARAM["_token"] = encrypt_token()
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            print(data, sep='\n')
            save(data)
    except Exception:
        logging.warning("Response status code: {}; the request succeeded but returned no target data.".format(response['code']))
Example #4
def main(base_url, page, cateId, originUrl):
    """Main function."""
    # Add the _token parameter
    GET_PARAM['cateId'] = str(cateId)
    GET_PARAM["originUrl"] = originUrl
    SIGN_PARAM = "areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}&userId={}&uuid={}".format(
        GET_PARAM["areaId"],
        GET_PARAM["cateId"],
        GET_PARAM["cityName"],
        GET_PARAM["dinnerCountAttrId"],
        GET_PARAM["optimusCode"],
        GET_PARAM["originUrl"],
        GET_PARAM["page"],
        GET_PARAM["partner"],
        GET_PARAM["platform"],
        GET_PARAM["riskLevel"],
        GET_PARAM["sort"],
        GET_PARAM["userId"],
        GET_PARAM["uuid"]
    )
    GET_PARAM["_token"] = encrypt_token(SIGN_PARAM)
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            data['city'] = base_url.split('//')[-1].split('.')[0]
            data['cateId'] = GET_PARAM['cateId']
            print(data, sep='\n')
            save(data)
    except Exception:
        logging.warning("Response status code: {}; the request succeeded but returned no target data.".format(response['code']))
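A hedged usage sketch of how this main could be driven across several pages; the base URL, page range, cateId, and delay below are placeholders, not values from the original project:

import time

if __name__ == '__main__':
    BASE_URL = 'https://example.com/api/?'  # placeholder endpoint, not the real one
    for page in range(1, 11):               # assumed page range
        main(BASE_URL, page, cateId=0, originUrl=BASE_URL)
        time.sleep(2)                        # polite delay between requests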
Example #5
            '//div[@id="col-content"]/h1')[0].text_content().split(' - ')
        match['home'] = name[0]
        match['away'] = name[1]
        match['event'] = get_league_info(r.url)[1:]
        event_request = requests.get(
            'http://www.soccer24.com/match/' + match['match_id'])
        event_tree = html.fromstring(event_request.text)
        phrases = event_tree.xpath(
            '//table[@class="detail"]//a/text()')[0].split(' - ')[1:]
        match['event'] += phrases[::-1]
        f.write(json.dumps(match) + '\n')
    except Exception:
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()


if __name__ == "__main__":
    if os.path.exists(reget_file):
        os.rename(reget_file, reget_file + '.swp')
        with open(reget_file + '.swp') as f:
            for line in f.readlines():
                get_match(line.replace('\n', ''))
        os.remove(reget_file + '.swp')
        with open('data/reget.json') as reget:
            for line in reget.readlines():
                json_data = json.loads(line)
                parse.parse_json(json_data)
        os.remove('data/reget.json')
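The get_league_info helper called near the top of this example is not shown. A hypothetical sketch, assuming the league information is encoded in the URL path segments:

from urllib.parse import urlparse

def get_league_info(url):
    # Hypothetical helper: return the non-empty path segments of the match URL,
    # so that [1:] drops the leading segment as used above.
    return [segment for segment in urlparse(url).path.split('/') if segment]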
Example #6
import parse as p
import author_network as an
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt

p.parse_json('../data/icml2019.json')

an.get_edges()

# print(an.co_worker_counts())
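The snippet imports networkx and matplotlib but never uses them here. A sketch of how the author network could be assembled and drawn, assuming an.get_edges() returns an iterable of (author, author) pairs (the module itself is not shown, so this is only a guess at its interface):

import author_network as an
import networkx as nx
from matplotlib import pyplot as plt

G = nx.Graph()
G.add_edges_from(an.get_edges())   # assumes get_edges() yields (author, author) pairs
nx.draw(G, node_size=20, with_labels=False)
plt.show()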
Example #7
def average_degree(input, output):
    # Creates the output text file
    output_file = open(output,'w')

    # String that stores all the content of the output.txt
    # Used for debugging purposes
    #test = ''

    # Creates a JSON array out of the line-delimited JSON file
    json_input = utility.to_json_array(input)

    adjacency_list = {}
    previous_threshold_time = None

    # Iterates through all objects within JSON Array
    for current_obj in json_input:
        # Checks if text exists within json_input
        if 'text' in current_obj:
            # Creates list of texts and timestamps of posts
            text = parse.parse_json(current_obj,'text')
            created_at = parse.parse_json(current_obj,'created_at')

            # Cleans texts and timestamps and parses timestamps
            text = utility.clean_text(text)
            created_at = utility.clean_text(created_at)
            parsed_time = parse.parse_timestamps(created_at)

            # Calculates current post threshold time
            threshold_time = utility.minute_offset(parsed_time)

            # Initializes previous threshold time so it can be used
            # for comparison and skip redundant checks
            if previous_threshold_time is None:
                previous_threshold_time = threshold_time

            # Removes all edges in adjacency list below threshold time if
            # threshold time was changed and resets threshold time to latest
            # post's time
            if threshold_time != previous_threshold_time:
                adjacency_list = remove_outdated(adjacency_list, threshold_time)
            previous_threshold_time = threshold_time

            # Builds the adjacency list from the post's hashtag list:
            # {'text': timestamp} -> {Node: [{edge: , timestamp: }, {edge: , timestamp: }]}
            adjacency_list = identify_hashtags(adjacency_list, text, parsed_time)

            # Writes adjacency list to text output and testing console
            #print(debug.debug_adjacency_list(adjacency_list))
            #test += debug.debug_adjacency_list(adjacency_list)
            #output_file.write(debug.debug_adjacency_list(adjacency_list))

            # Creates degree list and calculates average degrees
            degree_list = create_degree_list(adjacency_list)
            average_degree = utility.calculate_average(degree_list)

            # Adds degree list and average degrees to output file and
            # debug text
            #print(debug.debug_degree_list(degree_list))
            #print('Average degree = ' + debug.debug_degrees(degree_list) + str(average_degree))
            #output_file.write(debug.debug_degree_list(degree_list))
            #output_file.write('Average degree = ' + debug.debug_degrees(degree_list) + ' = ' + str(average_degree) + '\n')
            #print(str(average_degree))
            output_file.write(str(average_degree) + '\n')
            #test += debug.debug_degree_list(degree_list)
            #test += 'Average degree = ' + str(average_degree)+'\n\n'

    # Close files
    output_file.close()
    print("\nAverage degree completed\nOutput in: " + output)
Example #8
        match['event'] = league[1:-1] + \
        [get_season(league, r.url)] + \
        phrases[::-1]

        f.write(json.dumps(match) + '\n')
        # print(json.dumps(match) + '\n')
    except Exception:
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()


if __name__ == "__main__":
    mkdir_p('data')
    if sys.argv[1] == 'retry':
        if os.path.exists(reget_file):
            os.rename(reget_file, reget_file + '.swp')
        with open(reget_file + '.swp') as f:
            for line in f.readlines():
                get_match(line.replace('\n', ''))
        os.remove(reget_file + '.swp')
        with open('data/reget.json') as reget:
            for line in reget.readlines():
                json_data = json.loads(line)
                parse.parse_json(json_data)
        os.remove('data/reget.json')
    else:
        get_matches(int(sys.argv[1]))
Example #9
     answer_id)
 print(url, end='')
 headers = {
     "Accept":
     "*/*",
     "Accept-Encoding":
     "gzip, deflate",
     "User-Agent":
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
 }
 try:
     answer_id += 1
     r = requests.get(url, headers=headers)
     json_read = json.loads(r.content)
     if 'error' not in json_read:
         parsed, length = parse_json(json_read)
         if length > 500:
             print('   recorded')
             with open('zhihu_corpus_' + str(file_num) + '.json',
                       'a') as f:
                 f.write(parsed + '\n')
                 if os.path.getsize('zhihu_corpus_' + str(file_num) +
                                    '.json') / 1000000 > 100:
                     file_num += 1
         else:
             print('')
     else:
         print('')
 except Exception:
     print('\n sleeping...')
     time.sleep(5)
Example #10
import os

os.environ['DJANGO_SETTINGS_MODULE'] = 'betmanager.settings'

import get
import parse
import sys
import json

filename = get.get_matches(int(sys.argv[1]))
print(filename)
with open(filename) as f:
    lines = f.readlines()
for line in lines:
    result = json.loads(line)
    parse.parse_json(result)