def extract_insult_tweets(self):
    try:
        insults_df = read_csv(self.loc.format('data/insults.csv'))
    except FileNotFoundError:
        self.logger.critical('Insults file not found. extract_insults.py must be run first')
        raise
    # get the schema of a tweet, then delete the data
    tweets_df = json_normalize(self.api.get_status(insults_df.loc[0, 'tweet_id'])._json)
    tweets_df.drop(0, inplace=True)
    # build a DF of tweet data for each tweet in the insults data;
    # iterate in chunks so we can use Twitter's GET statuses endpoint for bulk lookup
    n_insults = len(insults_df)
    chunksize = 100
    cursor = 0
    while cursor < n_insults:
        cursor_end_pos = min(cursor + chunksize - 1, n_insults - 1)
        self.logger.debug('Loading tweets {0}-{1}'.format(cursor, cursor_end_pos))
        tweet_ids = insults_df.loc[cursor:cursor_end_pos, 'tweet_id'].tolist()
        res = self.api.statuses_lookup(tweet_ids)
        for item in res:
            tweet = json_normalize(item._json)
            tweets_df = tweets_df.append(tweet)
        cursor += chunksize
    self.insult_tweets_df = tweets_df
def decode_report(rpt_path):
    # read report from json into a dict
    with open(rpt_path, 'r') as f:
        read_rpt = json.loads(f.read())

    # parse the geojson
    def df_clean(uncleandf):
        cleaned_cols = [x.split('.')[-1] for x in uncleandf.columns]
        uncleandf.columns = cleaned_cols
        clean_df = uncleandf.rename(columns={'coordinates': 'coords'}).drop(['type'], axis=1)
        clean_df = clean_df.set_index(['Name'])
        return clean_df

    # parse conduit, node, and parcel data into dataframes
    conds_df = df_clean(json_normalize(read_rpt['conduits']['features']))
    nodes_df = df_clean(json_normalize(read_rpt['nodes']['features']))
    pars_df = df_clean(json_normalize(read_rpt['parcels']['features']))

    return {'conduits': conds_df, 'nodes': nodes_df, 'parcels': pars_df}
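# A minimal usage sketch for decode_report (not from the original source);
# 'report.json' is a hypothetical path to a report with the
# 'conduits'/'nodes'/'parcels' GeoJSON structure assumed above.
rpt = decode_report('report.json')
print(rpt['conduits'].head())          # conduit attributes indexed by Name
print(rpt['nodes']['coords'].head())   # 'coordinates' renamed to 'coords'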
def main():
    logger = get_root_logger()
    get_header(logger, 'LOADING PROJECTIONS')
    client = APIClient()
    # grab dataframe shape from a trial run
    data = client.get_data('weekly-projections', 'json', 'QB')
    test_df = json_normalize(data['Projections'])
    # get DF structure from columns in test_df
    cols = test_df.columns
    df = DataFrame(columns=cols)
    # grab current week
    current_week = test_df.week.values[0]
    # loop through all weeks up to current week
    for wk in [str(x) for x in range(int(current_week))]:
        logger.info('Processing projections for week {0}'.format(int(wk) + 1))
        # loop through all positions
        for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']:
            tmp_data = client.get_data('weekly-projections', 'json', pos, wk)
            tmp_df = json_normalize(tmp_data['Projections'])
            df = df.append(tmp_df)
    # import this df directly to PG DB
    conn = DBClient()
    conn.load(df, 'projections', schema='raw', if_exists='replace')
def find_top_major_project_themes(n):
    with open('data/world_bank_projects.json') as f:
        projects = json.load(f)
    normalized_df = json_normalize(projects, 'mjtheme_namecode')
    # remove duplicates
    deduped_df = normalized_df.drop_duplicates()
    # create a dictionary mapping theme code to theme name (skip blank names)
    project_theme_dict = {}
    for index, row in deduped_df.iterrows():
        theme_name = row['name']
        if len(theme_name) > 0:
            project_theme_dict[row['code']] = theme_name
    # fill in blank theme names directly in the loaded records
    for project in projects:
        for theme in project['mjtheme_namecode']:
            if len(theme['name']) == 0:
                theme['name'] = str(project_theme_dict[theme['code']])
    normalized_df = json_normalize(projects, 'mjtheme_namecode')
    print('\n\n' + str(normalized_df.code.value_counts()[:n]))
def test_json_normalize_errors(self):
    # GH14583: If meta keys are not always present
    # a new option to set errors='ignore' has been implemented
    i = {
        "Trades": [
            {"general": {
                "tradeid": 100,
                "trade_version": 1,
                "stocks": [{"symbol": "AAPL", "name": "Apple", "price": "0"},
                           {"symbol": "GOOG", "name": "Google", "price": "0"}]}},
            {"general": {
                "tradeid": 100,
                "stocks": [{"symbol": "AAPL", "name": "Apple", "price": "0"},
                           {"symbol": "GOOG", "name": "Google", "price": "0"}]}}
        ]
    }
    j = json_normalize(data=i['Trades'],
                       record_path=[['general', 'stocks']],
                       meta=[['general', 'tradeid'], ['general', 'trade_version']],
                       errors='ignore')
    expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
    assert j.fillna('').to_dict() == expected

    msg = ("Try running with errors='ignore' as key 'trade_version'"
           " is not always present")
    with pytest.raises(KeyError, match=msg):
        json_normalize(data=i['Trades'],
                       record_path=[['general', 'stocks']],
                       meta=[['general', 'tradeid'], ['general', 'trade_version']],
                       errors='raise')
def search(self, query, max_tweets=200, remove_rts=True, hard_remove=True):
    # Search API only allows 100 tweets per page
    max_pages = int(max_tweets) // 100
    if max_pages < 1:
        max_pages = 1
    if max_tweets < 100:
        count = int(max_tweets)
    else:
        count = 100

    # Prepare query
    if remove_rts:
        query += ' -filter:retweets'
    if hard_remove:
        query += ' -RT'  # eliminates anything with RT, which may not always be a retweet
    # encoded_query = urllib.quote_plus(query)

    page = 0
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    for i in range(max_pages):
        if page == 0:
            params = {'q': query, 'result_type': 'recent', 'count': count, 'lang': 'en'}
        else:
            max_id = data[-1]['id'] - 1
            params = {'q': query, 'result_type': 'recent', 'count': count,
                      'lang': 'en', 'max_id': max_id}
        r = requests.get(url, auth=self.auth, params=params)
        data = simplejson.loads(r.text)['statuses']
        if len(data) == 0:
            if self.verbose:
                print('No more results found')
            break
        if page == 0:
            df = json_normalize(data)
        else:
            df = df.append(json_normalize(data))
        page += 1

    # Check that all columns are there; if not, add empty ones
    for col in self.columns:
        if col not in df.columns:
            df[col] = pd.Series([np.nan] * len(df), index=df.index)
    if len(self.tweets) == 0:
        self.tweets = df[self.columns]
    else:
        self.tweets = self.merge(df[self.columns])

    # Filter by location
    if self.track_location:
        if self.verbose:
            print('Filtering by location')
        self.get_geo()
    return
def _stats(self, data):
    if len(data['intervals']) > 0:
        output = json_normalize(data, 'intervals', ['system_id', 'total_devices']) \
            .set_index(['system_id', 'end_at'])
    else:
        output = json_normalize(data).set_index('system_id')
    return output
def _monthly_production(self, data):
    if len(data['meter_readings']) > 0:
        output = json_normalize(data, 'meter_readings',
                                ['start_date', 'system_id', 'end_date', 'production_wh'])
    else:
        output = json_normalize(data, meta=['start_date', 'system_id', 'end_date', 'production_wh'])
    return output.set_index(['system_id', 'start_date', 'end_date'])
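# A minimal sketch of the record_path/meta pattern used by _stats and
# _monthly_production above; the sample dict is made up for illustration
# and is not the API's actual payload.
sample = {'system_id': 42, 'total_devices': 3,
          'intervals': [{'end_at': 1, 'powr': 10}, {'end_at': 2, 'powr': 12}]}
flat = json_normalize(sample, 'intervals', ['system_id', 'total_devices'])
# one row per interval, with system_id/total_devices repeated as meta columns
print(flat.set_index(['system_id', 'end_at']))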
def timeline(self, max_tweets=200, exclude_replies='true', include_rts='false'):
    """
    Load the Twitter timeline for the specified user

    :param screen_name: Twitter screen name to process
    :param max_tweets: maximum number of tweets to get, up to the API maximum of 3200 (Default: 200)
    :param exclude_replies: exclude replies? (Default: true)
    :param include_rts: include retweets? (Default: false)
    :return: pandas DataFrame of tweets
    """
    # API only allows up to 200 tweets per page
    max_pages = int(min(max_tweets, 3200) // 200)
    if max_pages < 1:
        max_pages = 1
    # Need to be strings, not booleans
    if isinstance(exclude_replies, bool):
        exclude_replies = str(exclude_replies).lower()
    if isinstance(include_rts, bool):
        include_rts = str(include_rts).lower()
    if max_tweets < 200:
        count = int(max_tweets)
    else:
        count = 200

    page = 0
    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    for i in range(max_pages):
        if page == 0:
            params = {'screen_name': self.screen_name, 'count': count, 'lang': 'en',
                      'exclude_replies': exclude_replies, 'include_rts': include_rts}
        else:
            max_id = data[-1]['id'] - 1
            params = {'screen_name': self.screen_name, 'count': count, 'lang': 'en',
                      'exclude_replies': exclude_replies, 'include_rts': include_rts,
                      'max_id': max_id}
        r = requests.get(url, auth=self.auth, params=params)
        data = simplejson.loads(r.text)
        if page == 0:
            df = json_normalize(data)
        else:
            df = df.append(json_normalize(data), ignore_index=True)
        page += 1

    if len(self.tweets) == 0:
        self.tweets = df[self.columns]
    else:
        self.tweets = self.merge(df[self.columns])
def get_dataset_details_for_classification_collection(context, classification, collection_name, api_key):
    collections = get_collections_for_classification(context, classification, api_key)
    collections = collections[collections.name == collection_name]
    urls = pandas.concat([json_normalize(utl.get(base_url + '/' + a)['json'])
                          for a in collections['url']])
    # flatten the nested url lists, keeping only the json representations
    # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    urls = [url['href'] for urllist in urls['ons.collectionDetail.urls.url'].values
            for url in urllist if url['@representation'] == 'json']
    details = pandas.concat([json_normalize(utl.get(base_url + '/' + url)['json'])
                             for url in urls])
    # get the english csvs
    details['file_url'] = [doc['href']['$']
                           for doclist in details['ons.datasetDetail.documents.document']
                           for doc in doclist
                           if (doc['@type'] == 'CSV') and (doc['href']['@xml.lang'] == 'en')]
    details['geography'] = [name['$']
                            for namelist in details['ons.datasetDetail.geographicalHierarchies.geographicalHierarchy.names.name']
                            for name in namelist
                            if name['@xml.lang'] == 'en']
    details['collection_name'] = collection_name
    return details[['collection_name', 'geography', 'file_url']]
def test_json_normalize_errors(self, missing_metadata):
    # GH14583:
    # If meta keys are not always present a new option to set
    # errors='ignore' has been implemented
    msg = ("Try running with errors='ignore' as key 'name'"
           " is not always present")
    with pytest.raises(KeyError, match=msg):
        json_normalize(
            data=missing_metadata,
            record_path='addresses',
            meta='name',
            errors='raise')
def df_from_json(data, **kwargs):
    """Attempt to produce row-oriented data from hierarchical json/dict-like data."""
    if isinstance(data, str):
        with open(data) as data_file:
            data = json.load(data_file)
    if isinstance(data, list):
        return json_normalize(data, **kwargs)
    elif isinstance(data, dict):
        for k, v in iteritems(data):
            if isinstance(v, list):
                return json_normalize(v)
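# A minimal usage sketch for df_from_json (not from the original source);
# the dict below is made up to show that the first list value found is the
# one that gets normalized.
records = {'meta': 'ignored', 'rows': [{'a': 1, 'b': {'c': 2}},
                                       {'a': 3, 'b': {'c': 4}}]}
df = df_from_json(records)
print(df.columns.tolist())  # ['a', 'b.c']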
def test_meta_name_conflict(self):
    data = [{'foo': 'hello',
             'bar': 'there',
             'data': [{'foo': 'something', 'bar': 'else'},
                      {'foo': 'something2', 'bar': 'else2'}]}]

    with pytest.raises(ValueError):
        json_normalize(data, 'data', meta=['foo', 'bar'])

    result = json_normalize(data, 'data', meta=['foo', 'bar'],
                            meta_prefix='meta')
    for val in ['metafoo', 'metabar', 'foo', 'bar']:
        assert val in result
def flatten_dictionary(returned_dictionary, review, call, df, commentDb, line):
    if call == "keywords":
        first_level = json_normalize(returned_dictionary[review], call,
                                     ['rating', 'sail_date', 'ship', 'line'])
        second_level = json_normalize(returned_dictionary[review][call])
        together = pd.merge(first_level, second_level, on='text', how='outer')
        df = pd.concat([df, together])
    else:
        if review in commentDb["Msc"]:
            rating = commentDb["Msc"][review]["rating"]
            for element in returned_dictionary[review][call]:
                second_level = json_normalize(element)
                second_level['review'] = review
                second_level['rating'] = rating
                df = pd.concat([df, second_level])
    return df
def j2c_spc(json_file_path):
    print(json_file_path)
    json_file = open(json_file_path)
    '''
    json_data = re.sub(r'}{', '}\,{', json_file.read())
    data = '[' + json_data + ']'
    json_data = json.loads(data)
    print(json_normalize(json_data))
    '''
    df = pd.DataFrame()
    # each line may contain several concatenated JSON objects ("}{" with no
    # separator), so split and re-balance the braces before parsing
    for line in json_file:
        jsonobj_list = line.split('}{')
        for jsonobj in jsonobj_list:
            if re.search(r'^\{', jsonobj):
                jsonobj = jsonobj + '}'
            elif re.search(r'\}$', jsonobj):
                jsonobj = '{' + jsonobj
            else:
                jsonobj = '{' + jsonobj + '}'
            jsonrsp = json.loads(jsonobj)['response']
            if len(jsonrsp) != 0:
                df = df.append(json_normalize(jsonrsp))
    df.to_csv('./hos_rawdata/hos422/doclist.csv', encoding='utf8', mode='a+',
              index=False,
              columns=['hospitalName', 'departmentName', 'doctorName',
                       'title', 'sex', 'specialty'])
def propose_ad_thread(ids, runid):
    rs = []
    headers = [5, 15, 35]
    adtypes = ['skyscraper', 'square', 'banner']
    colors = ['green', 'blue', 'red', 'black', 'white']
    productids = range(10, 25)
    # TODO create dataframe instead of proposing it on the fly
    for i in ids:
        rs.append(proposepage(i=i, runid=runid,
                              header=random.choice(headers),
                              adtype=random.choice(adtypes),
                              color=random.choice(colors),
                              productid=random.choice(productids),
                              price=float(str(np.around(np.random.uniform(50), 2)))).json())
        time.sleep(1)
    # TODO save price
    df = json_normalize(rs)
    df.columns = ['Error', 'Success']
    df.to_csv("rewards" + str(runid) + "_" + str(ids[0]) + ".csv", index=False)
def import_data():
    r = requests.get('http://www.citibikenyc.com/stations/json')
    df = json_normalize(r.json()['stationBeanList'])
    # take the string and parse it into a Python datetime object
    exec_time = parse(r.json()['executionTime'])
    exec_time = exec_time.strftime('%x-%X')
    return r, df, exec_time
def test_shallow_nested(self):
    data = [{'state': 'Florida',
             'shortname': 'FL',
             'info': {'governor': 'Rick Scott'},
             'counties': [{'name': 'Dade', 'population': 12345},
                          {'name': 'Broward', 'population': 40000},
                          {'name': 'Palm Beach', 'population': 60000}]},
            {'state': 'Ohio',
             'shortname': 'OH',
             'info': {'governor': 'John Kasich'},
             'counties': [{'name': 'Summit', 'population': 1234},
                          {'name': 'Cuyahoga', 'population': 1337}]}]

    result = json_normalize(data, 'counties',
                            ['state', 'shortname', ['info', 'governor']])
    ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit', 'Cuyahoga'],
               'state': ['Florida'] * 3 + ['Ohio'] * 2,
               'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
               'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
               'population': [12345, 40000, 60000, 1234, 1337]}
    expected = DataFrame(ex_data, columns=result.columns)
    tm.assert_frame_equal(result, expected)
def gettweets(folder, dir_category):
    global features_list
    global features_df
    global row_count
    filename = "%s/%s/tweets.dump" % (dir_category, folder)
    jfile = codecs.open(filename, 'rb', 'utf-8')
    break_counter = 0
    for jdoc in jfile:
        print("User: ", folder, " - Tweets : ", row_count)
        # break counter is tweet count
        if break_counter == 100:
            break
        jvar = json.loads(jdoc)
        flat = flatten_json(jvar)
        norm = json_normalize(flat)
        features_df = pd.concat([features_df, norm], ignore_index=True)
        break_counter += 1
        if "label" not in list(features_df):
            features_df['label'] = str("NA")
        features_df.loc[row_count, 'label'] = label
        features_df.fillna("NA", inplace=True)
        row_count += 1
def split_link_queries(df):
    """
    df: pandas.DataFrame (insights_creatives_links merged)
    saves dataframe to be returned as csv
    returns: pandas.DataFrame (df with split link queries updated)
    """
    import json

    # transform dataframe into json
    json_string = df.to_json(orient="records")  # dataframe -> string
    json_list = json.loads(json_string)         # string -> list

    # update split link queries
    for record in json_list:
        if record["link"]:
            url = record["link"]
            query = url[url.find("?") + 1:]
            queries = query.split("&")
            # update each record with split link queries
            pairs = {}  # pairs of query field & value
            for q in queries:
                pair = q.split("=")
                pairs[pair[0]] = pair[1]
            record.update(pairs)

    return json_normalize(json_list)
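# A hedged alternative sketch to the manual split above (not the original
# author's approach): urllib.parse handles percent-encoding and query
# parameters with missing values.
from urllib.parse import urlsplit, parse_qsl

url = "https://example.com/p?utm_source=fb&utm_medium=cpc"
pairs = dict(parse_qsl(urlsplit(url).query))
# {'utm_source': 'fb', 'utm_medium': 'cpc'}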
def myvariant_post(hgvs_list):
    '''
    Query and parser for myvariant.info.
    Parses raw Elasticsearch results into a pandas dataframe.

    Parameters
    -------------
    hgvs_list: list, required

    Output
    -------------
    pandas df: normalized json of myvariant results
    '''
    if type(hgvs_list) == list:
        hgvs_list = ','.join(hgvs_list)
    assert type(hgvs_list) == str
    con = mv.getvariants(hgvs_list, fields='dbnsfp.cadd.phred,dbnsfp.genename')
    mv_df = json_normalize(con)
    mv_df.index = mv_df['_id']
    return mv_df
def instagram_scraper(query, n):
    url = '{0}/tags/{1}/media/recent?client_id={2}&count=30'.format(
        base_url, query, CLIENT_ID)
    urls = list()
    results = list()
    urls.append(str(url))
    for _ in range(n):
        x = get(url)     # next_url of the current page
        urls.append(str(x))
        url = x          # advance to the next page
    for url in urls:
        r = requests.get(url)
        j = r.json()
        if 'data' in j:
            try:
                data = j['data']
                df_instance = json_normalize(data)
                results.append(df_instance)
            except Exception as e:
                return 'Error: Could not find data.', str(e)
def get_historical_prices(symbols, start_date=None, end_date=None):
    """
    Pulls historical prices for the given stock symbols over the given interval.
    Returns only about 500 rows per query.

    :param symbols: stock symbol or list of symbols
    :param start_date: in 'yyyy-mm-dd'
    :param end_date: in 'yyyy-mm-dd'
    :return: query result converted from json to a pandas dataframe
    """
    if type(symbols) is str:
        symbol_string = symbols
    else:
        symbol_string = "','".join(symbols)
    end_date = dateutil.parser.parse(end_date) if end_date else date.today()
    start_date = (dateutil.parser.parse(start_date) if start_date
                  else end_date - dateutil.relativedelta.relativedelta(years=1))
    print(str(start_date), type(start_date))
    print(str(end_date), type(end_date))
    yql_query = ("SELECT * FROM {0} WHERE symbol IN ('{1}') "
                 "AND startDate ='{2}' AND endDate ='{3}'").format(
        FINANCE_TABLES['history'], symbol_string, str(start_date), str(end_date))
    print(yql_query)
    query_result = execute_yql_query(yql_query)
    df = json_normalize(query_result['query']['results']['quote'])
    df.columns = [col_name.lower() for col_name in df.columns]
    return df
def format_prices(self, prices, flag_calc_spread=True):
    """Format prices data as a DataFrame with hierarchical columns"""

    def cols(typ):
        return {
            'openPrice.%s' % typ: 'Open',
            'highPrice.%s' % typ: 'High',
            'lowPrice.%s' % typ: 'Low',
            'closePrice.%s' % typ: 'Close',
            'lastTradedVolume': 'Volume'
        }

    df = json_normalize(prices)
    df = df.set_index('snapshotTime')
    df.index.name = 'DateTime'

    df_ask = df[['openPrice.ask', 'highPrice.ask', 'lowPrice.ask', 'closePrice.ask']]
    df_ask = df_ask.rename(columns=cols('ask'))
    df_bid = df[['openPrice.bid', 'highPrice.bid', 'lowPrice.bid', 'closePrice.bid']]
    df_bid = df_bid.rename(columns=cols('bid'))
    if flag_calc_spread:
        df_spread = df_ask - df_bid
    df_last = df[['openPrice.lastTraded', 'highPrice.lastTraded',
                  'lowPrice.lastTraded', 'closePrice.lastTraded', 'lastTradedVolume']]
    df_last = df_last.rename(columns=cols('lastTraded'))

    if not flag_calc_spread:
        df2 = pd.concat([df_bid, df_ask, df_last], axis=1,
                        keys=['bid', 'ask', 'last'])
    else:
        df2 = pd.concat([df_bid, df_ask, df_spread, df_last], axis=1,
                        keys=['bid', 'ask', 'spread', 'last'])
    return df2
def handle_twitter():
    # Return data from Twitter
    ACCESS_TOKEN = '<access-token>'
    ACCESS_SECRET = '<access-secret>'
    CONSUMER_KEY = '<consumer-key>'
    CONSUMER_SECRET = '<consumer-secret>'

    # Read train data with sentiment positive and negative
    oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
    twitter_stream = TwitterStream(auth=oauth)
    twitter = Twitter(auth=oauth)

    x = []
    for i in range(0, 1):
        # Read tweets mentioning Hillary Clinton
        iterator = twitter_stream.statuses.filter(track="@HillaryClinton,", language="en")
        x.append(iterator)
    # iterator = twitter.search.tweets(q='HillaryClinton', lang='en', count=10000)

    collectobj = []
    # Create a DataFrame from the JSON objects with columns id, User_name, Text, location, status
    for iterator in x:
        for tweet in iterator:
            if 'user' in tweet.keys() and tweet['user']['location']:
                # Get probability of a tweet being positive, negative or neutral
                topic = getprobtop(tweet['text'])
                obj = {'id': tweet['user']['id'],
                       'User_name': tweet['user']['screen_name'],
                       'Text': tweet['text'],
                       'location': tweet['user']['location'],
                       'status': topic}
                print("Returning from prob")
                collectobj.append(obj)
    table = json_normalize(collectobj)
    return table
def get_nosource_files_info(block):
    '''
    Search replicas for a given block and return the files that have no
    replica at all ("[]" in the replica field). The returned value is a
    dictionary holding a pandas data frame with the metainfo of the block,
    the count of no-source files, and the total number of files in the block.
    Only files created more than 1 week ago are reported.
    '''
    url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/filereplicas'
    params = {"block": block}
    replicas_info = requests.get(url=url, params=params, verify=False).content
    replicas_json = json.loads(replicas_info)
    replicas_table = json_normalize(replicas_json['phedex']['block'][0]['file'])
    # Discard row entries of files with a creation date of one week or less
    replicas_table = replicas_table[replicas_table['time_create'].apply(
        check_datetime_Xweeks_older, nweeks=1)]
    num_files_in_block = len(replicas_table)
    no_source_files_table = replicas_table.loc[
        replicas_table.astype(str)['replica'] == "[]"]
    num_nosource_files_in_block = len(no_source_files_table)
    return {'df': no_source_files_table,
            'num_files_in_block': num_files_in_block,
            'num_nosource_files_in_block': num_nosource_files_in_block}
def mongo_to_dataframe(mongo_data):
    sanitized = json.loads(json_util.dumps(mongo_data))
    normalized = json_normalize(sanitized)
    df = pd.DataFrame(normalized)
    return df
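# A minimal usage sketch for mongo_to_dataframe (not from the original
# source); the list below stands in for a pymongo query result, and
# json_util is assumed to be bson.json_util.
docs = [{'user': {'name': 'ada'}, 'score': 10}]
df = mongo_to_dataframe(docs)
print(df.columns.tolist())  # nested fields are flattened, e.g. 'user.name'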
def test_missing_meta(self, missing_metadata):
    # GH25468
    # If metadata is nullable with errors set to ignore, the null values
    # should be numpy.nan values
    result = json_normalize(
        data=missing_metadata,
        record_path='addresses',
        meta='name',
        errors='ignore')
    ex_data = [
        ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
        ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
    ]
    columns = ['city', 'number', 'state', 'street', 'zip', 'name']
    expected = DataFrame(ex_data, columns=columns)
    tm.assert_frame_equal(result, expected)
def get_urls(url, n):
    # return a next_url
    def get(url):
        return str(requests.get(url).json()['pagination']['next_url'])

    # list to hold urls
    urls = list()
    urls.append(str(url))  # add initial url to list
    # handle further urls
    for _ in range(n):
        x = get(url)
        urls.append(str(x))  # add next_url
        url = x              # replace current url with next_url for the next turn in the loop

    # list to hold data
    results = list()
    # populate results with data from each url
    for url in urls:
        results.append(json_normalize(requests.get(url).json()['data']))

    # initiate df
    df = pd.DataFrame().append(results).reset_index().drop('index', axis=1)
    return df
def test_meta_name_conflict(self):
    data = [{'foo': 'hello',
             'bar': 'there',
             'data': [{'foo': 'something', 'bar': 'else'},
                      {'foo': 'something2', 'bar': 'else2'}]}]

    msg = (r"Conflicting metadata name (foo|bar),"
           " need distinguishing prefix")
    with pytest.raises(ValueError, match=msg):
        json_normalize(data, 'data', meta=['foo', 'bar'])

    result = json_normalize(data, 'data', meta=['foo', 'bar'],
                            meta_prefix='meta')
    for val in ['metafoo', 'metabar', 'foo', 'bar']:
        assert val in result
match_id_required = 22912
home_team_required = "Tottenham Hotspur"
away_team_required = "Liverpool"

# Load in the data and match events
file_name = str(match_id_required) + '.json'
import json
with open('Statsbomb/data/events/' + file_name) as data_file:
    # print(mypath + 'events/' + file)
    data = json.load(data_file)

# Get the nested structure into a dataframe
from pandas.io.json import json_normalize
df = json_normalize(data, sep="_").assign(match_id=file_name[:-5])

# A dataframe of shots
shots = df.loc[df['type_name'] == 'Shot'].set_index('id')

# Draw the pitch
from FCPython import createPitch
(fig, ax) = createPitch(pitchLengthX, pitchWidthY, 'yards', 'gray')

# Plot the shots
for i, shot in shots.iterrows():
    x = shot['location'][0]
    y = shot['location'][1]
    goal = shot['shot_outcome_name'] == 'Goal'
    team_name = shot['team_name']
# trade from-to
import json
import inquirer
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from binance.client import Client
from datetime import datetime

client = Client('API_KEY', 'API_SECRET')

trades = client.get_all_orders(symbol='BTCUSDT')
trades = json_normalize(trades)
trades['data'] = pd.to_datetime(trades['time'], unit='ms')
trades['updateTime'] = pd.to_datetime(trades['updateTime'], unit='ms')
trades = trades[['time', 'data', 'price', 'side', 'status']]
trades = trades[trades.status != 'CANCELED']
trades['price'] = trades['price'].astype(float)
trades['side'] = trades['side'].astype(str)
trades = trades.reset_index(drop=True)

# percent change between each SELL and the preceding order
column = pd.Series(index=range(0, len(trades)))
for i in range(len(trades)):
    if trades['side'][i] == 'SELL':
        column[i] = round((trades['price'][i] - trades['price'][i - 1]) * 100
                          / trades['price'][i - 1], 2)
    else:
        column[i] = 0
trades['perc'] = column
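# A hedged, vectorized alternative to the loop above (a sketch, not the
# original author's code): pct_change computes the same percentage between
# consecutive orders, kept only on SELL rows.
perc = trades['price'].pct_change().mul(100).round(2)
trades['perc'] = perc.where(trades['side'] == 'SELL', 0).fillna(0)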
url = "https://covid-19-coronavirus-statistics.p.rapidapi.com/v1/stats" querystring = {"country": "US"} headers = { 'x-rapidapi-host': "covid-19-coronavirus-statistics.p.rapidapi.com", 'x-rapidapi-key': "2d81c24244mshffe53b231648b51p1450f7jsn88b6e0713f7e" } response = requests.get(url, headers=headers, params=querystring) response.json() json_res = response.json() df1 = json_normalize(json_res, ['data', 'covid19Stats']) st.title('COVID19 Data (via API)') st.subheader('Total Deaths:') st.write(sum(df1.deaths)) st.subheader('Total Confirmed') st.write(sum(df1.confirmed)) if st.checkbox('Show Data'): st.write(df1, height=1000, length=1000) if st.checkbox('deaths'): c = alt.Chart(df1, width=1000, height=1000).mark_bar(clip=True).encode(x='province',
def TradeFlow(date_input):
    log_pnl_filename = "LogPnlRisk_" + date_input.strftime("%Y%m%d") + ".json"
    bt_pnl_filename = "ProfitRiskSeq_" + date_input.strftime("%Y%m%d") + ".json"
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    if not path.exists(log_pnl_filename):
        return
    with open(log_pnl_filename) as json_file:
        cur_json = json.load(json_file)
    log_DF = json_normalize(cur_json)
    log_DF['timestamp'] = pd.to_datetime(log_DF['Pnl.Timestamp'])
    log_DF = log_DF.set_index('timestamp')

    if not path.exists(bt_pnl_filename):
        return
    with open(bt_pnl_filename) as json_file:
        cur_json = json.load(json_file)
    bt_DF = json_normalize(cur_json)
    bt_DF['timestamp'] = pd.to_datetime(bt_DF['Pnl.Timestamp'])
    bt_DF = bt_DF.set_index('timestamp')

    log_DF['Vega_norm'] = log_DF['Risk.Vega'] / log_DF['Profit.Volume'][-1]
    bt_DF['Vega_norm'] = bt_DF['Risk.Vega'] / bt_DF['Profit.Volume'][-1]
    log_DF['CD_norm'] = log_DF['Risk.CashDelta'] / log_DF['Profit.Volume'][-1]
    bt_DF['CD_norm'] = bt_DF['Risk.CashDelta'] / bt_DF['Profit.Volume'][-1]

    plt.close()
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('time')
    ax1.set_ylabel('Vega/VolumeTot', color=color)
    ax1.plot(log_DF['Vega_norm'], color='deepskyblue', label='Log')
    ax1.plot(bt_DF['Vega_norm'], color='crimson', label='BackTest')
    ax1.set_title("VegaFlow " + str(date_input))
    ax1.legend(loc='lower right')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('Vol', color='black')  # we already handled the x-label with ax1
    ax2.plot(log_DF['Risk.AtmVol'], color='black', label='FitVol')
    # ax2.plot(TR_DF['Risk.AtmVol'], color='gold', label='TRVol')
    ax2.legend(loc='upper right')
    ax2.tick_params(axis='y', labelcolor='black')
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    plt.savefig("VegaFlow_" + str(date_input).replace("-", "") + ".png")

    plt.close()
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('time')
    ax1.set_ylabel('CashDelta/VolumeTot', color=color)
    ax1.plot(log_DF['CD_norm'], color='deepskyblue', label='Log')
    ax1.plot(bt_DF['CD_norm'], color='crimson', label='BackTest')
    ax1.set_title("CashDeltaFlow " + str(date_input))
    ax1.legend(loc='lower right')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('Vol', color='black')  # we already handled the x-label with ax1
    ax2.plot(log_DF['Risk.Spot'], color='black', label='SpotPrice')
    # ax2.plot(TR_DF['Risk.AtmVol'], color='gold', label='TRVol')
    ax2.legend(loc='upper right')
    ax2.tick_params(axis='y', labelcolor='black')
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    plt.savefig("CashDeltaFlow_" + str(date_input).replace("-", "") + ".png")
    return
        # pass
        # print(json.loads(response.text))
        response = session.post('https://direct.pinpoll.com/v2/vote',
                                headers=headers, data=data)
        r = json.loads(response.text)
        r["timestamp"] = datetime.now()
        results.append(r)
        print(r["result"][-1]["votes"])
    except Exception as e:
        print(e)
        # print(response.text)
    t.sleep(2)  # Don't set this to zero, we don't want to DDoS the server.

data = json_normalize(results)
votes = data["result"].apply(
    lambda x: pd.DataFrame(x).transpose().rename(columns={
        0: "384232", 1: "384233", 2: "384234", 3: "384235"
    }).drop("id"))
votes = pd.concat(votes.to_list())
data = pd.concat([data, votes.reset_index(drop=True)], axis=1)
data.set_index("timestamp")[['384232', '384233', '384234', '384235']].plot()
data.to_csv("data.csv")
data[data["timestamp"] < "2020-05-23"][[
    '384232', '384233', '384234', '384235'
]].diff().sum()
data[data["timestamp"] >= "2020-05-23"][[
def load_json_bq(batch):
    # print(datetime.utcnow())
    # print(batch)
    json_data = []
    for f in batch.split('|'):  # this is what would happen for each file in a batch
        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                blob = BUCKET.get_blob(f)
                json_data.extend(json.loads(blob.download_as_string()))
                success = 1
            except Exception as e:
                if try_count == 5:  # on last try iteration exit function with empty list
                    db_logger.error('Batch Failed: {}'.format(batch))
                    f_logger.error('Batch Failed: {}'.format(batch),
                                   exc_info=True, stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1

    # get batch timestamp:
    batch_dt = datetime.now()

    # Load to BQ:
    # customers table:
    table_name = 'customers'
    table_columns = [
        'accepts_marketing', 'admin_graphql_api_id', 'created_at', 'currency',
        'email', 'first_name', 'id', 'last_name', 'last_order_id',
        'last_order_name', 'multipass_identifier', 'note', 'orders_count',
        'phone', 'state', 'tags', 'tax_exempt', 'total_spent', 'updated_at',
        'verified_email'
    ]
    df = json_normalize(json_data)
    if not df.empty:
        # format column names:
        df.columns = [x.strip().replace('.', '_') for x in df.columns]
        # only selected columns:
        df_schema = pd.DataFrame(df, columns=table_columns)
        df_schema['ods_inserted_at'] = batch_dt
        del df
        # convert datatypes:
        for col in ['accepts_marketing', 'admin_graphql_api_id', 'currency',
                    'email', 'first_name', 'last_name', 'last_order_name',
                    'multipass_identifier', 'note', 'phone', 'state', 'tags',
                    'tax_exempt', 'verified_email']:
            df_schema[col] = df_schema[col].astype('O')
        df_schema['id'] = df_schema['id'].astype('int')
        df_schema['created_at'] = pd.to_datetime(df_schema['created_at'])
        df_schema['updated_at'] = pd.to_datetime(df_schema['updated_at'])
        df_schema['orders_count'] = df_schema['orders_count'].fillna(0).astype('int')
        df_schema['last_order_id'] = df_schema['last_order_id'].fillna(0).astype('int')
        df_schema['total_spent'] = df_schema['total_spent'].fillna(0).astype('float64')

        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                # upload data to table:
                df_schema.to_gbq('{}.{}'.format(DATASET_ID, table_name),
                                 PROJECT_ID, chunksize=None, if_exists='append',
                                 private_key=SERVICE_ACCOUNT_KEY_FILE)
                success = 1
            except Exception as e:
                if try_count == 5:
                    db_logger.error('Table {} load failed: batch - {}'.format(table_name, batch))
                    f_logger.error('Table {} load failed: batch - {}'.format(table_name, batch),
                                   exc_info=True, stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1

    # customer_address table:
    table_name = 'customer_address'
    table_columns = [
        'id', 'customer_id', 'customer_updated_at', 'first_name', 'last_name',
        'address1', 'address2', 'city', 'company', 'country', 'country_code',
        'country_name', 'province', 'province_code', 'zip', 'phone', 'name',
        'default'
    ]
    df = json_normalize(json_data, 'addresses', ['updated_at'],
                        meta_prefix='customer_')
    if not df.empty:
        # format column names:
        df.columns = [x.strip().replace('.', '_') for x in df.columns]
        # only selected columns:
        df_schema = pd.DataFrame(df, columns=table_columns)
        df_schema['ods_inserted_at'] = batch_dt
        del df
        # convert column datatypes:
        for col in ['first_name', 'last_name', 'address1', 'address2', 'city',
                    'company', 'country', 'country_code', 'country_name',
                    'province', 'province_code', 'zip', 'phone', 'name']:
            df_schema[col] = df_schema[col].astype('O')
        df_schema['id'] = df_schema['id'].astype('int')
        df_schema['customer_id'] = df_schema['customer_id'].astype('int')
        df_schema['customer_updated_at'] = pd.to_datetime(df_schema['customer_updated_at'])
        df_schema['default'] = df_schema['default'].astype('bool')

        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                # upload data to table:
                df_schema.to_gbq('{}.{}'.format(DATASET_ID, table_name),
                                 PROJECT_ID, chunksize=None, if_exists='append',
                                 private_key=SERVICE_ACCOUNT_KEY_FILE)
                success = 1
            except Exception as e:
                if try_count == 5:
                    db_logger.error('Table {} load failed: batch - {}'.format(table_name, batch))
                    f_logger.error('Table {} load failed: batch - {}'.format(table_name, batch),
                                   exc_info=True, stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1
    # print('{} loaded'.format(table_name))

    # rename processed files:
    for f in batch.split('|'):
        new_name = '{}/{}'.format(PROCCESSED_PREFIX, f.split('/')[-1])
        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                blob = BUCKET.get_blob(f)
                BUCKET.rename_blob(blob, new_name, client=CLIENT)
                success = 1
            except Exception as e:
                if try_count == 5:
                    db_logger.error('Blob {} rename failed: batch - {}, '.format(f, batch))
                    f_logger.error('Blob {} rename failed: batch - {}, '.format(f, batch),
                                   exc_info=True, stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1
def json_to_df(filename):
    df_enriched = json_normalize(pd.Series(open(filename).readlines()).apply(json.loads))
    return df_enriched
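# A minimal usage sketch for json_to_df, assuming a JSON-lines file with one
# object per line; 'events.jsonl' is a hypothetical path.
df = json_to_df('events.jsonl')
print(df.head())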
def update_database():
    print('Fetching and updating 1')
    apiResponse = get('https://api.covid19india.org/raw_data.json')
    # print("Processes", apiResponse.status_code)
    if apiResponse.status_code == 200:
        raw_data = apiResponse.json()
        raw_data = raw_data['raw_data']
        # JSON to dataframe
        data = json_normalize(raw_data)
        data = data.rename(columns={
            "patientnumber": "ID",
            "statepatientnumber": "Government id",
            "dateannounced": "Diagnosed date",
            "agebracket": "Age",
            "gender": "Gender",
            "detectedcity": "Detected city",
            "detecteddistrict": "Detected district",
            "detectedstate": "Detected state",
            "nationality": "Nationality",
            "currentstatus": "Current status",
            "statuschangedate": "Status change date",
            "_d180g": "Notes",
            "backupnotes": "Backup notes",
            "contractedfromwhichpatientsuspected": "Contracted from which Patient (Suspected)",
            "estimatedonsetdate": "Estimated on set date",
            "source1": "Source 1",
            "source2": "Source 2",
            "source3": "Source 3"
        })
        # change nationality "Indian" to "India"
        for ind in data.index:
            if data.loc[ind, 'Nationality'] == "Indian":
                data.loc[ind, 'Nationality'] = "India"
        # convert the string values to datetime objects
        data['Diagnosed date'] = pd.to_datetime(data['Diagnosed date'], dayfirst=True)
        data['Status change date'] = pd.to_datetime(data['Status change date'], dayfirst=True)
        # replace all the missing values with unknown
        data.replace(to_replace="", value="unknown", inplace=True)
        # create new columns depicting the current status of each patient
        data['recovered'] = 0
        data['active'] = 0
        data['death'] = 0
        data['unknown'] = 0
        data['confirmed'] = 1
        for status in data.index:
            if data.loc[status, 'Current status'] == "Hospitalized":
                data.loc[status, 'active'] = 1
            elif data.loc[status, 'Current status'] == "Recovered":
                data.loc[status, 'recovered'] = 1
            elif data.loc[status, 'Current status'] == "Deceased":
                data.loc[status, 'death'] = 1
            else:
                data.loc[status, 'unknown'] = 1
        data.to_csv(file_loc + './data/data.csv', index=False,
                    date_format="%Y-%m-%d %H:%M:%S")
        # print('raw data complete')
    else:
        print("Connection error")
def create_issues_df(owner, repo, api):
    issues_list = issues_of_repo_github(owner, repo, api)
    return json_normalize(issues_list)
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd, numpy as np
from bs4 import BeautifulSoup as bs

with open("epamglobal.txt", 'r') as f:
    links = f.readlines()

result = pd.DataFrame()
for i in range(len(links)):
    try:
        page = urlopen(links[i]).read()
        data = bs(page, 'html.parser')
        body = data.find('body')
        script = body.find('script')
        raw = script.text.strip().replace('window._sharedData =', '').replace(';', '')
        json_data = json.loads(raw)
        posts = json_data['entry_data']['PostPage'][0]['graphql']
        posts = json.dumps(posts)
        posts = json.loads(posts)
        x = pd.DataFrame.from_dict(json_normalize(posts), orient='columns')
        x.columns = x.columns.str.replace("shortcode_media.", "")
        result = result.append(x)
    except Exception:
        pass

result = result.drop_duplicates(subset='shortcode')
result.index = range(len(result.index))
def test_more_deeply_nested(self):
    data = [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco', 'pop': 12345},
                                    {'name': 'Los Angeles', 'pop': 12346}]},
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus', 'pop': 1234},
                                    {'name': 'Cleveland', 'pop': 1236}]}]},
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich', 'pop': 12347}]},
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln', 'pop': 1239}]}]}]

    result = json_normalize(data, ['states', 'cities'],
                            meta=['country', ['states', 'name']])
    # meta_prefix={'states': 'state_'})
    ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
               'states.name': ['California', 'California', 'Ohio', 'Ohio',
                               'Bayern', 'Nordrhein-Westfalen', 'Nordrhein-Westfalen'],
               'name': ['San Francisco', 'Los Angeles', 'Columbus', 'Cleveland',
                        'Munich', 'Duesseldorf', 'Koeln'],
               'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
    expected = DataFrame(ex_data, columns=result.columns)
    tm.assert_frame_equal(result, expected)
# In[12]:

len(df3.id.unique())
# df3.id.unique()[1:3]

# In[17]:

detalhe_deputados = []
for id in df3.id.unique():  # df3.id.unique()[1:3]:
    print(id)
    # Deputy details:
    request_detalhe = requests.get(
        'https://dadosabertos.camara.leg.br/api/v2/deputados/{id!s}'.format(id=id))
    json_detalhe = json.loads(request_detalhe.text)
    df_detalhado = json_normalize(json_detalhe['dados'])
    detalhe_deputados.append(df_detalhado)
    # The deputy's reimbursement requests will go here....

# In[20]:

dicionario_deputados_detalhe = concat(detalhe_deputados, ignore_index=False, sort=True)
dicionario_deputados_detalhe

# In[21]:

print(engine.table_names())
def __call__(self, query_date, grafana_dict, idb_dict, tag):
    GRAFANA_HOST = grafana_dict['GRAFANA_HOST']
    GRAFANA_REQUEST_ANNO_QUERY = '/api/annotations'
    GRAFANA_USERNAME = grafana_dict['GRAFANA_USERNAME']
    GRAFANA_PASSWORD = grafana_dict['GRAFANA_PASSWORD']
    GRAFANA_FROM = query_date['DATE_FROM'] + '000'
    GRAFANA_TO = query_date['DATE_TO'] + '000'
    GRAFANA_TAG1 = grafana_dict['GRAFANA_TAG1']
    GRAFANA_TAG2 = grafana_dict['GRAFANA_TAG2']
    # GRAFANA_PANEL_ID = '16'
    # GRAFANA_DASHBOARD_ID = '15'
    IDB_HOST = idb_dict['IDB_HOST']
    IDB_PORT = idb_dict['IDB_PORT']
    IDB_DBNAME = idb_dict['IDB_DBNAME']
    IDB_CHANNEL = tag
    IDB_USER = idb_dict['IDB_USER']
    IDB_PASSWORD = idb_dict['IDB_PASSWORD']
    KEYWORD = ''

    def read_influxdb_data(host='192.168.123.245', port=8086,
                           dbname='c9377a95-82f3-4af3-ac14-40d14f6d2abe',
                           ChannelName='1Y520210100',
                           time_start='', time_end='',
                           user='******', password='******', keyword=''):
        client = DataFrameClient(host, port, user, password, dbname)
        measurements = client.get_list_measurements()
        if keyword is None:
            keyword = ''
        if keyword == '':
            measurement = [mea.get(u'name') for mea in measurements
                           if mea.get(u'name').find(ChannelName) >= 0]
        else:
            measurement = [mea.get(u'name') for mea in measurements
                           if mea.get(u'name').find(ChannelName) >= 0
                           and mea.get(u'name').find(keyword) >= 0]
        if len(measurement) == 0:
            print('No data retrieved.')
            return None
        measurement = measurement[-1]
        time_end = 'now()' if time_end == '' else "'" + time_end + "'"
        time_start = 'now()' if time_start == '' else "'" + time_start + "'"
        querystr = 'select * from "{}" where time > {} and time < {}'.format(
            measurement, time_start, time_end)
        # print(querystr)
        df = client.query(querystr).get(measurement)
        client.close()
        if df is None:
            print('InfluxDB no data retrieved.')
            return None
        dff = df.groupby('id')
        columns = [name for name, group in dff]
        groups = [group['val'] for name, group in dff]
        # check datetime alignment:
        all([all(groups[i].index == groups[0].index) for i in range(1, len(groups))])
        result = pd.concat(groups, axis=1)
        result.columns = columns
        result.index = groups[0].index
        return measurement, result

    def encode_base64(username, password):
        str_user = username + ':' + password
        str_user_byte = str_user.encode('utf8')               # string to byte
        str_user_encode64 = base64.b64encode(str_user_byte)   # encode by base64
        str_user_string = str_user_encode64.decode('utf8')    # byte to string
        str_auth = 'Basic ' + str(str_user_string)
        return str_auth

    ## Request annotation list from Grafana ##
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": encode_base64(GRAFANA_USERNAME, GRAFANA_PASSWORD)
    }
    url = (GRAFANA_HOST + GRAFANA_REQUEST_ANNO_QUERY + '?' +
           '&tags=' + GRAFANA_TAG1 +
           '&tags=' + GRAFANA_TAG2 +
           '&from=' + GRAFANA_FROM +
           '&to=' + GRAFANA_TO)
    print(url)
    print(headers)
    req = requests.get(url, headers=headers)
    req_data_json = req.json()
    req_data_pd = json_normalize(req_data_json)
    # print(req_data_pd)
    # which means grafana retrieved no data from the API
    # if 'timestamp' not in req_data_pd.index:
    #     return 'no data retrieve from Grafana'

    # GMT+8
    annotation = req_data_pd[['regionId', 'tags', 'time', 'email']]
    annotation = annotation.sort_values(by=['regionId', 'time'])
    annotation['time'] = pd.to_datetime(annotation['time'], unit='ms')
    annotation.rename(index=str, columns={'time': 'timestamp'}, inplace=True)
    # Keep only rows whose regionId appears more than once (start/end pairs)
    anno_dup_list = annotation.set_index('regionId').index.get_duplicates()
    annotation = annotation.loc[annotation['regionId'].isin(anno_dup_list)]

    ## Request SCADA eigen value from InfluxDB ##
    # scada_idb = pd.read_csv('test.csv')
    # scada_idb['Unnamed: 0'] = scada_idb['Unnamed: 0'].astype(str).str[:-6]
    # scada_idb.rename(index=str, columns={'Unnamed: 0': 'timestamp'}, inplace=True)
    # scada_idb = scada_idb.sort_values(by=['timestamp'])
    IDB_TIME_START = datetime.datetime.fromtimestamp(
        int(query_date['DATE_FROM'])).strftime('%Y-%m-%d %H:%M:%S')
    IDB_TIME_END = datetime.datetime.fromtimestamp(
        int(query_date['DATE_TO'])).strftime('%Y-%m-%d %H:%M:%S')
    measurement, scada_idb = read_influxdb_data(host=IDB_HOST, port=IDB_PORT,
                                                dbname=IDB_DBNAME,
                                                ChannelName=IDB_CHANNEL,
                                                time_start=IDB_TIME_START,
                                                time_end=IDB_TIME_END,
                                                user=IDB_USER,
                                                password=IDB_PASSWORD)
    scada_idb['timestamp'] = scada_idb.index
    scada_idb['timestamp'] = scada_idb['timestamp'].astype(str).str[:-6]
    # which means SCADA retrieved no data from InfluxDB
    if len(scada_idb['timestamp']) == 0:
        return 'no data retrieve from SCADA'

    ## Align SCADA and Grafana dataframes ##
    label_df = pd.DataFrame()
    for regionID in annotation.regionId.unique():
        tags_list = annotation[annotation['regionId'] == regionID]['tags'].iloc[0]
        mail = annotation[annotation['regionId'] == regionID]['email'].iloc[0]
        label_start_time = str(annotation[annotation['regionId'] == regionID]['timestamp'].iloc[0])
        label_end_time = str(annotation[annotation['regionId'] == regionID]['timestamp'].iloc[1])
        label_start_time = datetime.datetime.strptime(label_start_time, '%Y-%m-%d %H:%M:%S')
        label_end_time = datetime.datetime.strptime(label_end_time, '%Y-%m-%d %H:%M:%S')
        # loop scada_idb; if the timestamp falls between start and end, set the tag values
        for i in range(len(scada_idb['timestamp'])):
            datetime_object = datetime.datetime.strptime(
                scada_idb['timestamp'][i], '%Y-%m-%d %H:%M:%S')
            # datetime_object = datetime_object + datetime.timedelta(hours=8) + datetime.timedelta(days=7)
            label_df.at[i, 'timestamp'] = scada_idb['timestamp'][i]
            if (datetime_object > label_start_time) and (datetime_object < label_end_time):
                for num, tags in enumerate(tags_list):
                    # print(tags)
                    label_df.at[i, tags] = 1
                label_df.at[i, mail] = 1
                # print(label_start_time, label_end_time, datetime_object)

    # label_df.drop('x', axis=1, inplace=True)
    output_df = pd.merge(scada_idb, label_df.fillna(-1), on=['timestamp'])
    output_df.drop([GRAFANA_TAG2], axis=1, inplace=True)
    return output_df
# Database and Container names - should be consistent across runs
database_name = '<database-name>'
container_name = '<container-name>'

# Create the Cosmos client; connect to the database and container of interest
client = cosmos_client.CosmosClient(url, key)
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Execute query to pull all JSON items and all fields into a single DataFrame:
output = pd.DataFrame()
for item in container.query_items(
        query='SELECT * FROM c where startswith(c.id, "' + WF_ID + '") AND c.state = "COMPLETE"',
        enable_cross_partition_query=True):
    flat = json_normalize(fj.flatten_json(item))
    output = output.append(flat, ignore_index=True)
# print(output)

# Create new DataFrame with only fields of interest
of_interest = output[[
    "id", "state", "name", "description", "resources_cpu_cores",
    "resources_preemptible", "resources_ram_gb", "resources_disk_gb",
    # Headers for WGS runs:
    'logs_0_logs_0_stdout',
def test_json_normalize_errors(self):
    # GH14583: If meta keys are not always present
    # a new option to set errors='ignore' has been implemented
    i = {
        "Trades": [
            {"general": {
                "tradeid": 100,
                "trade_version": 1,
                "stocks": [{"symbol": "AAPL", "name": "Apple", "price": "0"},
                           {"symbol": "GOOG", "name": "Google", "price": "0"}]}},
            {"general": {
                "tradeid": 100,
                "stocks": [{"symbol": "AAPL", "name": "Apple", "price": "0"},
                           {"symbol": "GOOG", "name": "Google", "price": "0"}]}}
        ]
    }
    j = json_normalize(data=i['Trades'],
                       record_path=[['general', 'stocks']],
                       meta=[['general', 'tradeid'], ['general', 'trade_version']],
                       errors='ignore')
    expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
    self.assertEqual(j.fillna('').to_dict(), expected)

    self.assertRaises(KeyError, json_normalize,
                      data=i['Trades'],
                      record_path=[['general', 'stocks']],
                      meta=[['general', 'tradeid'], ['general', 'trade_version']],
                      errors='raise')
# pandas_Bin_result = json_normalize(json_Bin_result)
# return pandas_Bin_result

url = "https://api.binance.com"
res = requests.get(url + "/api/v3/ticker/price", params={'symbol': "BTCUSDT"})
Bin_result = res.json()
# print(type(Bin_result))
# print(Bin_result[0]['price'])
# print(Bin_result[0]['symbol'])
# print("-------------------- values of Bin_result --------------------")
# print(type(Bin_result['price']))

# Bin_result['price'] is a string, so convert it to float
binance_price = float(Bin_result['price'])
Bin_result['price'] = binance_price

str_Bin_result = json.dumps(Bin_result)
json_Bin_result = json.loads(str_Bin_result)
pandas_Bin_result = json_normalize(json_Bin_result)

# 1. Ways to add a column
# 1-1. Option 1
# city = ['32424']
# pandas_Bin_result['city'] = city
# 1-2. Option 2
# df = df.assign(city=['Lahore'])
# print(df)

# 2021-04-06
# data order: (symbol, price)
# variable renaming done
# data loading done
# still needs to be wrapped in a function
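# A minimal sketch of the function the TODO above asks for; the name
# get_binance_price is an assumption, not from the original.
def get_binance_price(symbol="BTCUSDT"):
    res = requests.get("https://api.binance.com/api/v3/ticker/price",
                       params={'symbol': symbol})
    result = res.json()
    result['price'] = float(result['price'])  # price arrives as a string
    return json_normalize(result)             # one-row frame: symbol, price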
if os.path.exists(arg1):
    l = os.listdir(arg1)
    for each_file in l:
        if each_file.endswith(".json"):
            print("Iteration")
            file_list.append(each_file)
else:
    print("Not found")
    sys.exit()

head = []
with open("22combine.json", "w") as outfile:
    for f in file_list:
        with open(f, 'rb') as infile:
            file_data = json.load(infile)
            head += file_data
    json.dump(head, outfile)

with open("22combine.json", 'r') as file1:
    data1 = json.load(file1)
    data2 = json_normalize(data1)
    data2.to_csv("22result.csv", mode="a", index=False)
             'project_name']].groupby('countryname').agg('count')

# In[107]:

dataQ1Sorted = dataQ1.sort_values('project_name', ascending=False, inplace=False)

# In[109]:

dataQ1Sorted.head(10)

# In[204]:

# dataQ2 = data[['_id', 'mjtheme_namecode']]
json_normalize(data, 'mjtheme_namecode')

# In[187]:

dataQ2['mjtheme_namecode'].array

# In[181]:

flatten(dataQ2)

# In[154]:

print(dataQ2['mjtheme_namecode'].head())
def get_variants_by(filter_by, search_term, dataset, mode, timeout=None): query_for_transcripts = """ { transcript(transcript_id: "%s", reference_genome: %s) { transcript_id, transcript_version, gene { gene_id, symbol, start, stop, strand, chrom, hgnc_id, gene_name, full_gene_name, omim_id } variants(dataset: %s) { pos rsid ref alt consequence genome { genome_af:af genome_ac:ac genome_an:an genome_ac_hemi:ac_hemi genome_ac_hom:ac_hom } exome { exome_af:af exome_ac:ac exome_an:an exome_ac_hemi:ac_hemi exome_ac_hom:ac_hom } flags lof consequence_in_canonical_transcript gene_symbol hgvsc lof_filter lof_flags hgvsc hgvsp reference_genome variant_id: variantId } gtex_tissue_expression{ adipose_subcutaneous, adipose_visceral_omentum, adrenal_gland, artery_aorta, artery_coronary, artery_tibial, bladder, brain_amygdala, brain_anterior_cingulate_cortex_ba24, brain_caudate_basal_ganglia, brain_cerebellar_hemisphere, brain_cerebellum, brain_cortex, brain_frontal_cortex_ba9, brain_hippocampus, brain_hypothalamus, brain_nucleus_accumbens_basal_ganglia, brain_putamen_basal_ganglia, brain_spinal_cord_cervical_c_1, brain_substantia_nigra, breast_mammary_tissue, cells_ebv_transformed_lymphocytes, cells_transformed_fibroblasts, cervix_ectocervix, cervix_endocervix, colon_sigmoid, colon_transverse, esophagus_gastroesophageal_junction, esophagus_mucosa, esophagus_muscularis, fallopian_tube, heart_atrial_appendage, heart_left_ventricle, kidney_cortex, liver, lung, minor_salivary_gland, muscle_skeletal, nerve_tibial, ovary, pancreas, pituitary, prostate, skin_not_sun_exposed_suprapubic, skin_sun_exposed_lower_leg, small_intestine_terminal_ileum, spleen, stomach, testis, thyroid, uterus, v****a, whole_blood } clinvar_variants{ variant_id, clinvar_variation_id, reference_genome, chrom, pos, ref, alt, clinical_significance, gold_stars, major_consequence, review_status } coverage(dataset: %s){ genome{ pos, mean, median, over_1, over_5, over_10, over_15, over_20, over_25, over_30, over_50, over_100 } exome{ pos, mean, median, over_1, over_5, over_10, over_15, over_20, over_25, over_30, over_50, over_100 } } gnomad_constraint{ exp_lof, exp_mis, exp_syn, obs_lof, obs_mis, obs_syn, oe_lof, oe_lof_lower, oe_lof_upper, oe_mis, oe_mis_lower, oe_mis_upper, oe_syn, oe_syn_lower, oe_syn_upper, lof_z, mis_z, syn_z, pLI, flags } exac_constraint{ exp_syn, exp_mis, exp_lof, obs_syn, obs_mis, obs_lof, mu_syn, mu_mis, mu_lof, syn_z, mis_z, lof_z, pLI } } } """ query_for_variants = """ { variant(%s: "%s", dataset: %s) { variantId reference_genome chrom pos ref alt colocatedVariants multiNucleotideVariants { combined_variant_id changes_amino_acids n_individuals other_constituent_snvs } exome { ac an ac_hemi ac_hom faf95 { popmax popmax_population } filters populations { id ac an ac_hemi ac_hom } age_distribution { het { bin_edges bin_freq n_smaller n_larger } hom { bin_edges bin_freq n_smaller n_larger } } qualityMetrics { alleleBalance { alt { bin_edges bin_freq n_smaller n_larger } } genotypeDepth { all { bin_edges bin_freq n_smaller n_larger } alt { bin_edges bin_freq n_smaller n_larger } } genotypeQuality { all { bin_edges bin_freq n_smaller n_larger } alt { bin_edges bin_freq n_smaller n_larger } } } } genome { ac an ac_hemi ac_hom faf95 { popmax popmax_population } filters populations { id ac an ac_hemi ac_hom } age_distribution { het { bin_edges bin_freq n_smaller n_larger } hom { bin_edges bin_freq n_smaller n_larger } } qualityMetrics { alleleBalance { alt { bin_edges bin_freq n_smaller n_larger } } genotypeDepth { all { 
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
        genotypeQuality {
          all {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
          alt {
            bin_edges
            bin_freq
            n_smaller
            n_larger
          }
        }
      }
    }
    flags
    rsid
    sortedTranscriptConsequences {
      canonical
      gene_id
      gene_version
      gene_symbol
      hgvs
      hgvsc
      hgvsp
      lof
      lof_flags
      lof_filter
      major_consequence
      polyphen_prediction
      sift_prediction
      transcript_id
      transcript_version
    }
  }
}
"""

query_for_genes = """
{
  gene(%s: "%s", reference_genome: %s) {
    gene_id
    symbol
    start
    stop
    strand
    chrom
    hgnc_id
    gene_name
    full_gene_name
    reference_genome
    omim_id
    canonical_transcript_id
    structural_variants(dataset: %s) {
      ac
      ac_hom
      an
      af
      reference_genome
      chrom
      chrom2
      end
      end2
      consequence
      filters
      length
      pos
      pos2
      type
      variant_id
    }
    variants(dataset: %s) {
      pos
      rsid
      ref
      alt
      consequence
      genome {
        genome_af: af
        genome_ac: ac
        genome_an: an
        genome_ac_hemi: ac_hemi
        genome_ac_hom: ac_hom
      }
      exome {
        exome_af: af
        exome_ac: ac
        exome_an: an
        exome_ac_hemi: ac_hemi
        exome_ac_hom: ac_hom
      }
      flags
      lof
      consequence_in_canonical_transcript
      gene_symbol
      hgvsc
      lof_filter
      lof_flags
      hgvsp
      reference_genome
      variant_id: variantId
    }
    mane_select_transcript {
      ensembl_id
      ensembl_version
      refseq_id
      refseq_version
    }
    transcripts {
      reference_genome
      gene_id
      transcript_id
      strand
      start
      stop
      chrom
    }
    exac_regional_missense_constraint_regions {
      start
      stop
      obs_mis
      exp_mis
      obs_exp
      chisq_diff_null
    }
    clinvar_variants {
      variant_id
      clinvar_variation_id
      reference_genome
      chrom
      pos
      ref
      alt
      clinical_significance
      gold_stars
      major_consequence
      review_status
    }
    coverage(dataset: %s) {
      exome {
        pos
        mean
        median
        over_1
        over_5
        over_10
        over_15
        over_20
        over_25
        over_30
        over_50
        over_100
      }
      genome {
        pos
        mean
        median
        over_1
        over_5
        over_10
        over_15
        over_20
        over_25
        over_30
        over_50
        over_100
      }
    }
    gnomad_constraint {
      exp_lof
      exp_mis
      exp_syn
      obs_lof
      obs_mis
      obs_syn
      oe_lof
      oe_lof_lower
      oe_lof_upper
      oe_mis
      oe_mis_lower
      oe_mis_upper
      oe_syn
      oe_syn_lower
      oe_syn_upper
      lof_z
      mis_z
      syn_z
      pLI
      flags
    }
    exac_constraint {
      exp_syn
      exp_mis
      exp_lof
      obs_syn
      obs_mis
      obs_lof
      mu_syn
      mu_mis
      mu_lof
      syn_z
      mis_z
      lof_z
      pLI
    }
  }
}
"""

if filter_by == "transcript_id":
    query = query_for_transcripts % (search_term.upper(), reference_genome, dataset, dataset)
elif filter_by == "rs_id":
    query = query_for_variants % ("rsid", search_term.lower(), dataset)
elif filter_by == "gene_id":
    query = query_for_genes % ("gene_id", search_term.upper(), reference_genome, sv_dataset, dataset, dataset)
elif filter_by == "gene_name":
    query = query_for_genes % ("gene_name", search_term.upper(), reference_genome, sv_dataset, dataset, dataset)
else:
    raise ValueError("Unknown `filter_by` type!")

# Get response
global response
response = requests.post(end_point, data={'query': query}, timeout=timeout)


def reset_output_dir(term):
    """Create a fresh outputs/<term>/ directory, removing any previous run."""
    out_dir = 'outputs/' + term + "/"
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    return out_dir


def dump_table(value, name, out_dir, title=None):
    """Flatten a nested JSON field, save it as a TSV and display it."""
    data = json_normalize(value)
    data.columns = data.columns.map(lambda x: x.split(".")[-1])
    data.to_csv(out_dir + name + ".tsv", sep="\t", index=False)
    if len(data) > 0 and mode == "single":
        st.markdown("\n **Table for: `" + (title or name) + "`**")
        st.dataframe(data)


# Parse response
if response.status_code == 200:
    st.markdown("---")
    st.subheader("Outputs for `{}` are being prepared.".format(search_term))
    st.markdown("\n")

    if filter_by == "transcript_id":
        out_dir = reset_output_dir(search_term)
        transcript_data = response.json()["data"]["transcript"]
        for json_key, value in transcript_data.items():
            if value is not None and type(value) not in [str, int]:
                dump_table(value, json_key, out_dir)

    elif filter_by == "rs_id":
        out_dir = reset_output_dir(search_term)
        variant_data = response.json()["data"]["variant"]
        info_path = out_dir + search_term + ".txt"
        general_info = "```"
        for json_key, value in variant_data.items():
            if value is None:
                continue
            # Basic scalar info in the `variant` part goes into the general-info file
            if type(value) in [str, int]:
                line = "\n" + json_key + ": " + str(value)
                with open(info_path, "a") as f:
                    f.write(line)
                general_info += line
            # Other nested parts (except `genome` and `exome`) become flat tables
            elif json_key not in ["genome", "exome"]:
                dump_table(value, json_key, out_dir)
            # `genome` and `exome` need one more level of unpacking
            else:
                for sub_key, sub_value in value.items():
                    if sub_value is None:
                        continue
                    if type(sub_value) in [str, int]:
                        line = "\n" + json_key + "_" + sub_key + ": " + str(sub_value)
                        with open(info_path, "a") as f:
                            f.write(line)
                        general_info += line
                    else:
                        dump_table(sub_value, json_key + "_" + sub_key, out_dir, title=sub_key)
        general_info += "```"
        if mode == "single":
            st.markdown("--- \n **General Info for your query**")
            st.info(general_info)

    elif filter_by in ["gene_id", "gene_name"]:
        out_dir = reset_output_dir(search_term)
        gene_data = response.json()["data"]["gene"]
        info_path = out_dir + search_term + ".txt"
        general_info = "```"
        for json_key, value in gene_data.items():
            if value is None:
                continue
            if type(value) in [str, int]:
                line = "\n" + json_key + ": " + str(value)
                with open(info_path, "a") as f:
                    f.write(line)
                general_info += line
            else:
                dump_table(value, json_key, out_dir)
        general_info += "```"
        if mode == "single":
            st.markdown("--- \n **General Info for your query**")
            st.info(general_info)

return response
def to_dataframe(self):
    """Returns the list as a pandas DataFrame."""
    dict_list = [obj.__dict__() for obj in self]
    return json_normalize(dict_list, sep="_")
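# A minimal sketch of the contract `to_dataframe` assumes: each element
# implements a `__dict__()` method returning a (possibly nested) dict.
# `Sample` and its field names are hypothetical:
from pandas.io.json import json_normalize

class Sample:
    def __init__(self, name, score):
        self.name = name
        self.score = score

    def __dict__(self):
        # nested dict: json_normalize with sep="_" flattens "stats.score"
        # into a column named "stats_score"
        return {"name": self.name, "stats": {"score": self.score}}

items = [Sample("a", 1), Sample("b", 2)]
df = json_normalize([obj.__dict__() for obj in items], sep="_")
# df.columns -> Index(['name', 'stats_score'], dtype='object')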
VERSION = '20180604'
LIMIT = 30

# In[12]:

radius = 700
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, latitude_n1, longitude_n1, radius, LIMIT)
results = requests.get(url).json()

# In[13]:

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
nearby_venues.columns

# In[14]:

def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
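# Hedged sketch of the usual next step: keep the interesting columns and apply
# get_category_type row-wise. The column names are assumptions about the
# flattened Foursquare payload above:
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]
# pull the primary category name out of the nested categories list
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# strip the "venue." prefix, mirroring the column cleaning used elsewhere here
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]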
darksky_api = keys["darksky_api"][0]

# location
google = "https://maps.googleapis.com/maps/api/geocode/json?address=Cape Town&key=" + google_api
resp_loc = requests.get(google)
lat = json.loads(resp_loc.content)["results"][0]["geometry"]["location"]["lat"]
lon = json.loads(resp_loc.content)["results"][0]["geometry"]["location"]["lng"]

# weather
dates = [str(int((dt.datetime.now(pytz.utc) - dt.timedelta(days=x)).timestamp()))
         for x in range(365)]
weather_df = pd.DataFrame()
i = 0
for d in dates:
    print(str(i) + ": " + str(d))
    darksky = ("https://api.darksky.net/forecast/" + darksky_api + "/"
               + str(lat) + "," + str(lon) + "," + d
               + "?exclude=hourly,alerts,flags")
    weather_df = weather_df.append(
        json_normalize(json.loads(requests.get(darksky).content)["currently"]))
    i += 1

# weather_df.to_pickle("/Users/phil/vscode/weather_lambda/model_training/data/weather_df_raw.pkl")
weather_df = pd.read_pickle("/Users/phil/vscode/weather_lambda/model_training/data/weather_df_raw.pkl")

# datetime and order
weather_df = weather_df.reset_index()
weather_df["datetime"] = [dt.datetime.fromtimestamp(weather_df["time"][i])
                          for i in range(0, len(weather_df))]
weather_df = weather_df.sort_values(by=["datetime"])

# take a look
weather_df.dtypes
weather_df["precipIntensity"]
weather_df["precipProbability"]
weather_df["precipType"].unique()
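# Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# An equivalent, faster pattern is to collect the per-day frames in a list and
# concatenate once; a sketch reusing the same variables as above:
import pandas as pd

frames = []
for d in dates:
    darksky = ("https://api.darksky.net/forecast/" + darksky_api + "/"
               + str(lat) + "," + str(lon) + "," + d
               + "?exclude=hourly,alerts,flags")
    frames.append(json_normalize(json.loads(requests.get(darksky).content)["currently"]))
weather_df = pd.concat(frames, ignore_index=True)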
# In[7]:

# Send the GET request and examine the results
results = requests.get(url).json()
# results

# In[9]:

# assign relevant part of JSON to venues
venues = results['response']['venues']

# transform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()

# # Clean university Dataframe

# In[10]:

# keep only columns that include venue name, and anything that is associated with location
clean_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
clean_dataframe = dataframe.loc[:, clean_columns]

# function that extracts the category of the venue (body restored from the
# identical helper defined earlier in this collection)
def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
def to_dataframe(self):
    """Returns this series and all its indexes as a pandas DataFrame."""
    dict_list = [obj.__dict__() for obj in self.indexes]
    return json_normalize(dict_list, sep="_")
def mine_data(file_name=None, first_match_id=first_match, last_match_id=last_match,
              stop_at=None, timeout=15, save_every=1000):
    """
    Mine data using the official Opendota API. Keep requests at a decent rate (3/s).
    For every request, a JSON containing 100 games is returned. The games are
    downloaded in descending order of the match IDs.

    Args:
        file_name: the name of the file where the dataframe will be stored
        first_match_id: lowest match ID to look at; currently set at the start of 7.06e
        last_match_id: highest match ID to look at; currently set at the end of 7.06e
        stop_at: when the dataframe contains stop_at games, the mining stops
        timeout: in case Opendota does not respond, wait timeout seconds before retrying
        save_every: save the dataframe every save_every entries

    Returns:
        dataframe with the mined games
    """
    global OPENDOTA_URL
    global REQUEST_TIMEOUT
    global COLUMNS
    global logger

    results_dataframe = pd.DataFrame()
    current_chunk = 1
    current_match_id = last_match_id
    games_remaining = stop_at

    while current_match_id > first_match_id:
        try:
            current_link = OPENDOTA_URL + str(current_match_id)
            logger.info("Mining chunk starting at match ID %d", current_match_id)
            response = urllib2.urlopen(current_link, timeout=timeout)
        except (urllib2.URLError, ssl.SSLError):
            logger.error("Failed to make a request starting at match ID %d", current_match_id)
            logger.info("Waiting %d seconds before retrying", timeout)
            time.sleep(timeout)
            current_match_id -= 1
            continue

        try:
            response_json = json.load(response)
            last_match_id = response_json[-1]['match_id']
        except (ValueError, KeyError, IndexError):
            logger.error("Corrupt JSON starting at match ID %d, skipping it", current_match_id)
            current_match_id -= 1
            continue

        current_match_id = last_match_id
        if games_remaining:
            games_remaining -= len(response_json)

        current_dataframe = json_normalize(response_json)
        if len(current_dataframe) == 0:
            logger.info("Found an empty dataframe, skipping 10 games")
            current_match_id -= 10
            continue

        results_dataframe = results_dataframe.append(current_dataframe, ignore_index=True)

        if len(results_dataframe) > current_chunk * save_every:
            current_chunk += 1
            if file_name:
                pd.DataFrame(results_dataframe, columns=COLUMNS).to_csv(file_name, index=False)
                logger.info("Saving to csv. Total of games mined: %d", len(results_dataframe))

        if stop_at:
            if len(results_dataframe) > stop_at:
                break

        time.sleep(REQUEST_TIMEOUT)

    return results_dataframe
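# What json_normalize does with one Opendota response: a list of match dicts
# becomes one row per match. Toy records; only match_id is a field actually
# referenced in the code above, the rest are illustrative:
from pandas.io.json import json_normalize

response_json = [
    {"match_id": 3265843723, "duration": 2213, "radiant_win": True},
    {"match_id": 3265843701, "duration": 1870, "radiant_win": False},
]
df = json_normalize(response_json)
# df has columns ['match_id', 'duration', 'radiant_win'] and two rows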
import json

from configparser import ConfigParser
from bson import json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient

config_parser = ConfigParser()
config_parser.read('../config/reader-config.ini', encoding='utf-8')

mongo_client = MongoClient(config_parser.get('mongo', 'host'),
                           int(config_parser.get('mongo', 'port')))
tweets = mongo_client[config_parser.get('mongo', 'db')][config_parser.get('mongo', 'collection')]
data = tweets.find({})

# load MongoDB data as JSON data and flatten using json_normalize
sanitized = json.loads(json_util.dumps(data))

# replace new lines with spaces and quote the text field
for i, j in enumerate(sanitized):
    j['text'] = '"' + j['text'].strip().replace("\n", " ") + '"'

normalized = json_normalize(sanitized)
normalized.to_csv(
    path_or_buf="../../../data-set/data-set.csv",
    columns=[column.strip() for column in config_parser.get('csv', 'columns').split(',')],
    encoding="utf-8",
    index_label="instance_id"
)

for i, j in enumerate(sanitized):
    print(i, j['id'], j['user']['screen_name'], j['truncated'], j['text'])
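# After json_normalize, nested tweet fields are exposed as dotted column names
# such as 'user.screen_name', which is what the column list written to the CSV
# can reference. A small sketch with a made-up record:
from pandas.io.json import json_normalize

sample = [{"id": 1, "text": "hello", "user": {"screen_name": "alice"}}]
frame = json_normalize(sample)
# frame.columns -> Index(['id', 'text', 'user.screen_name'], dtype='object')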
def create_pull_requests_comments_df(owner, repo, api):
    pull_requests_comments_list = pull_requests_comments_of_repo_github(owner, repo, api)
    return json_normalize(pull_requests_comments_list)
def getdata(url):
    res = requests.get(url)
    data = res.json()
    df_api = json_normalize(data)
    return df_api
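# Hypothetical usage of the wrapper above (placeholder URL): a JSON object
# becomes a single-row frame, a JSON array becomes one row per element.
df = getdata("https://api.example.com/records")
print(df.head())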
def define_object(self):
    # Extract only the wanted keys from a dictionary
    def extract_items(dict_data, keys):
        new_dict = {}
        for i in dict_data.keys():
            if i in keys:
                new_dict[i] = dict_data[i]
        return new_dict

    defined_object_df = pd.DataFrame()
    try:
        with open(self.path, encoding='UTF8') as f:
            # 1. Load the .json file
            data = json.load(f)

            # 2. Extract the data we will use
            keys = ['count', 'edges']
            j1 = extract_items(data, keys)

            # 3. Normalize the JSON format
            j2 = json_normalize(j1['edges'])

            # 4. Drop the columns that are not needed, then rename the rest
            original = set(j2.columns)
            fixed = {
                'node.display_url', 'node.edge_liked_by.count',
                'node.edge_media_to_caption.edges',
                'node.edge_media_to_comment.count', 'node.id',
                'node.owner.id', 'node.taken_at_timestamp'
            }
            j2.drop(list(original - fixed), axis=1, inplace=True)

            # Clear sets that are no longer used
            original.clear()
            fixed.clear()

            # rename columns
            j2.columns = [
                'contents_url', 'like_count', 'post', 'comment_count',
                'post_id', 'user_id', 'timeStamp'
            ]

            # 5. Extract tags from the text (using a regex)
            tag_list = []
            for row in j2['post']:
                if not bool(row):
                    tag_list.append("null+nan+none")
                else:
                    re_row = row[0]['node']['text']
                    if re_row.find("#") > -1:
                        p = re.compile(r'#([^#\s]+)')
                        tag_from_row = p.findall(re_row)
                        tag_from_row = [x.strip('#') for x in tag_from_row]
                        row = ' '.join(tag_from_row)
                        tag_list.append(row)
                    else:
                        tag_list.append(None)

            # 6. Create a new column and insert the tag list into it
            j2['extracted_tags'] = tag_list
            defined_object_df = defined_object_df.append(j2, ignore_index=True)

            print("The DataFrame has {} rows and {} columns.".format(
                defined_object_df.shape[0], defined_object_df.shape[1]))
    except Exception:
        print("There is no data to normalize! Please check.")
        print("Path: {}".format(self.path))
    finally:
        return defined_object_df
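# The tag-extraction regex from step 5 in isolation (the sample caption is
# made up):
import re

caption = "sunset walk #CapeTown #no_filter #2024"
tags = re.compile(r'#([^#\s]+)').findall(caption)
# tags -> ['CapeTown', 'no_filter', '2024']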
parser = argparse.ArgumentParser(description='csg conversion tool.')
parser.add_argument('-i', '--input', help='Filename to read in', required=True)
parser.add_argument('-x', '--x_axis', help='Material to plot on the x axis', required=True)
args = parser.parse_args()

filename = args.input

with open(os.path.join(filename)) as f:
    results = json.load(f)

results_df = json_normalize(data=results)

df_filtered_by_mat = results_df
# df_filtered_by_mat = results_df[(results_df['fw_material'] == 'eurofer') & (results_df['armour_material'] == 'tungsten')]

x = list(df_filtered_by_mat['fw_thickness'])
y = list(df_filtered_by_mat['armour_thickness'])
z = list(df_filtered_by_mat['leakage_neutron_current.value'])
z_e = list(df_filtered_by_mat['leakage_neutron_current.std_dev'])
labels = [str(i) + '+-' + str(j) for i, j in zip(z, z_e)]

if len(x) < 40:
    coords = list(zip(x, y))
    GP = GpRegressor(coords, z, y_err=z_e)