def clean_tweets(input, output):
    # Creates the output text file
    output_file = open(output, 'w')

    # Creates JSON array out of line-delimited JSON file
    json_input = utility.to_json_array(input)

    # Initialize unicode count variable
    unicode_count = 0

    # Iterate through all JSON objects within the array
    for current_obj in json_input:
        if 'text' in current_obj:
            text = parse.parse_json(current_obj, 'text')
            created_at = parse.parse_json(current_obj, 'created_at')

            # Unicode check
            if contain_unicode(text) or contain_unicode(created_at):
                unicode_count += 1

            text = utility.clean_text(text)
            created_at = utility.clean_text(created_at)

            # Write the cleaned post to the output file
            #print(text + ' (timestamp: ' + created_at + ')\n')
            output_file.write(text + ' (timestamp: ' + created_at + ')\n')

    # Write the aggregate unicode count into the output file
    output_file.write(str(unicode_count) + ' tweet(s) contained unicode.')
    #print(str(unicode_count) + ' tweet(s) contained unicode.')

    # Close files
    output_file.close()
    print("\nTweet clean completed\nOutput in: " + output)
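# Hedged usage sketch (not part of the original module): the file paths below are
# hypothetical, and the helper modules `utility` and `parse` are assumed to be the
# ones used by clean_tweets above. Given a line-delimited tweet JSON file, this
# writes one cleaned "text (timestamp: ...)" line per tweet plus a unicode summary.
if __name__ == '__main__':
    clean_tweets('tweet_input/tweets.txt', 'tweet_output/cleaned_tweets.txt')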
def main(base_url, page):
    """Main function."""
    # Add the _token parameter
    GET_PARAM["_token"] = encrypt_token()
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            print(data, sep='\n')
            save(data)
    except Exception:
        # No target data in the response; dump the raw response for inspection
        print(response)
def main(base_url, page):
    """Main function."""
    # Add the _token parameter
    GET_PARAM["_token"] = encrypt_token()
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            print(data, sep='\n')
            save(data)
    except Exception:
        logging.warning("Response status code: {}. The request succeeded, but no target data was obtained!".format(response['code']))
def main(base_url, page, cateId, originUrl):
    """Main function."""
    GET_PARAM['cateId'] = str(cateId)
    GET_PARAM["originUrl"] = originUrl
    # Build the signature string, then add the _token parameter
    SIGN_PARAM = ("areaId={}&cateId={}&cityName={}&dinnerCountAttrId={}&optimusCode={}"
                  "&originUrl={}&page={}&partner={}&platform={}&riskLevel={}&sort={}"
                  "&userId={}&uuid={}").format(
        GET_PARAM["areaId"], GET_PARAM["cateId"], GET_PARAM["cityName"],
        GET_PARAM["dinnerCountAttrId"], GET_PARAM["optimusCode"], GET_PARAM["originUrl"],
        GET_PARAM["page"], GET_PARAM["partner"], GET_PARAM["platform"],
        GET_PARAM["riskLevel"], GET_PARAM["sort"], GET_PARAM["userId"], GET_PARAM["uuid"]
    )
    GET_PARAM["_token"] = encrypt_token(SIGN_PARAM)
    GET_PARAM['page'] = str(page)
    url = base_url + urlencode(GET_PARAM)
    # proxies = xdaili_proxy()
    # session = requests.Session()
    # response = json.loads(session.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT).text)
    response = json.loads(requests.get(url, headers=HEADERS, timeout=TIMEOUT).text)
    try:
        infos = response['data']['poiInfos']
        for info in infos:
            data = parse_json(info)
            # Derive the city name from the subdomain of base_url
            data['city'] = base_url.split('//')[-1].split('.')[0]
            data['cateId'] = GET_PARAM['cateId']
            print(data, sep='\n')
            save(data)
    except Exception:
        logging.warning("Response status code: {}. The request succeeded, but no target data was obtained!".format(response['code']))
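# Hedged usage sketch (not part of the original script): the endpoint, category id,
# and origin URL below are placeholder values, and GET_PARAM, HEADERS, and TIMEOUT
# are assumed to be module-level globals as in the functions above.
if __name__ == '__main__':
    demo_base_url = 'https://example-city.example.com/meishi/api/poi/getPoiList?'  # hypothetical endpoint
    demo_origin_url = 'https://example-city.example.com/meishi/'                    # hypothetical origin page
    for page_no in range(1, 4):
        main(demo_base_url, page_no, cateId=1, originUrl=demo_origin_url)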
            '//div[@id="col-content"]/h1')[0].text_content().split(' - ')
        match['home'] = name[0]
        match['away'] = name[1]
        match['event'] = get_league_info(r.url)[1:]
        event_request = requests.get(
            'http://www.soccer24.com/match/' + match['match_id'])
        event_tree = html.fromstring(event_request.text)
        phrases = event_tree.xpath(
            '//table[@class="detail"]//a/text()')[0].split(' - ')[1:]
        match['event'] += phrases[::-1]
        f.write(json.dumps(match) + '\n')
    except:
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()


if __name__ == "__main__":
    if os.path.exists(reget_file):
        os.rename(reget_file, reget_file + '.swp')
        with open(reget_file + '.swp') as f:
            for line in f.readlines():
                get_match(line.replace('\n', ''))
        os.remove(reget_file + '.swp')
    with open('data/reget.json') as reget:
        for line in reget.readlines():
            json_data = json.loads(line)
            parse.parse_json(json_data)
    os.remove('data/reget.json')
import parse as p
import author_network as an
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt

p.parse_json('../data/icml2019.json')
an.get_edges()
# print(an.co_worker_counts())
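# Hedged follow-up sketch: networkx and pyplot are imported above but unused in this
# snippet. Assuming an.get_edges() returns an iterable of (author, author) pairs
# (an assumption, not confirmed by the snippet), the co-author graph could be drawn:
G = nx.Graph()
G.add_edges_from(an.get_edges())
nx.draw(G, node_size=20, with_labels=False)
plt.savefig('coauthor_network.png')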
def average_degree(input, output):
    # Creates the output text file
    output_file = open(output, 'w')

    # String that stores all the content of output.txt
    # Used for debugging purposes
    #test = ''

    # Creates JSON array out of line-delimited JSON file
    json_input = utility.to_json_array(input)

    adjacency_list = {}
    previous_threshold_time = None

    # Iterates through all objects within the JSON array
    for current_obj in json_input:
        # Checks if text exists within json_input
        if 'text' in current_obj:
            # Extracts text and timestamp of the post
            text = parse.parse_json(current_obj, 'text')
            created_at = parse.parse_json(current_obj, 'created_at')

            # Cleans text and timestamp and parses the timestamp
            text = utility.clean_text(text)
            created_at = utility.clean_text(created_at)
            parsed_time = parse.parse_timestamps(created_at)

            # Calculates current post threshold time
            threshold_time = utility.minute_offset(parsed_time)

            # Initializes previous threshold time so it can be used
            # for comparison and skip redundant checks
            if previous_threshold_time is None:
                previous_threshold_time = threshold_time

            # Removes all edges in adjacency list below threshold time if
            # threshold time was changed and resets threshold time to latest
            # post's time
            if threshold_time != previous_threshold_time:
                adjacency_list = remove_outdated(adjacency_list, threshold_time)
                previous_threshold_time = threshold_time

            # Creates adjacency list from the hashtag list extracted from the post:
            # {'text': timestamp} -> {node: [{edge: ..., timestamp: ...}, {edge: ..., timestamp: ...}]}
            adjacency_list = identify_hashtags(adjacency_list, text, parsed_time)

            # Writes adjacency list to text output and testing console
            #print(debug.debug_adjacency_list(adjacency_list))
            #test += debug.debug_adjacency_list(adjacency_list)
            #output_file.write(debug.debug_adjacency_list(adjacency_list))

            # Creates degree list and calculates average degree
            degree_list = create_degree_list(adjacency_list)
            average_degree = utility.calculate_average(degree_list)

            # Adds degree list and average degree to output file and
            # debug text
            #print(debug.debug_degree_list(degree_list))
            #print('Average degree = ' + debug.debug_degrees(degree_list) + str(average_degree))
            #output_file.write(debug.debug_degree_list(degree_list))
            #output_file.write('Average degree = ' + debug.debug_degrees(degree_list) + ' = ' + str(average_degree) + '\n')
            #print(str(average_degree))
            output_file.write(str(average_degree) + '\n')
            #test += debug.debug_degree_list(degree_list)
            #test += 'Average degree = ' + str(average_degree) + '\n\n'

    # Close files
    output_file.close()
    print("\nAverage degree completed\nOutput in: " + output)
        match['event'] = league[1:-1] + \
            [get_season(league, r.url)] + \
            phrases[::-1]
        f.write(json.dumps(match) + '\n')
        # print(json.dumps(match) + '\n')
    except:
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()


if __name__ == "__main__":
    mkdir_p('data')
    if sys.argv[1] == 'retry':
        if os.path.exists(reget_file):
            os.rename(reget_file, reget_file + '.swp')
            with open(reget_file + '.swp') as f:
                for line in f.readlines():
                    get_match(line.replace('\n', ''))
            os.remove(reget_file + '.swp')
        with open('data/reget.json') as reget:
            for line in reget.readlines():
                json_data = json.loads(line)
                parse.parse_json(json_data)
        os.remove('data/reget.json')
    else:
        get_matches(int(sys.argv[1]))
        answer_id)
    print(url, end='')
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    }
    try:
        answer_id += 1
        r = requests.get(url, headers=headers)
        json_read = json.loads(r.content)
        if 'error' not in json_read.keys():
            parsed, length = parse_json(json_read)
            if length > 500:
                print(' recorded')
                with open('zhihu_corpus_' + str(file_num) + '.json', 'a') as f:
                    f.write(parsed + '\n')
                if os.path.getsize('zhihu_corpus_' + str(file_num) + '.json') / 1000000 > 100:
                    file_num += 1
            else:
                print('')
        else:
            print('')
    except:
        print('\n sleeping...')
        time.sleep(5)
import os
os.environ['DJANGO_SETTINGS_MODULE'] = 'betmanager.settings'

import get
import parse
import sys
import json

filename = get.get_matches(int(sys.argv[1]))
print(filename)

with open(filename) as f:
    lines = f.readlines()
    for line in lines:
        result = json.loads(line)
        parse.parse_json(result)