import json
import re
import graphlab as gl
from graphlab import SFrame, SArray, SGraph


def process_frame(filename):
    sf = gl.load_sframe(filename)
    output_frame = SFrame()  # Set up our output frame

    # Columns for the output frame
    session_ids = []
    ip = []
    sub_count = []
    error_count = []
    time_count = []
    error_sequence_raw = []
    error_sequence = []

    # How many session IDs do we have?
    limit = len(sf['session_id'].unique())

    # Start grabbing each session
    for i in range(1, limit):
        # Test short-circuit: stop after the first 100 sessions
        if i % 100 == 0:
            break
        # Get the session and sort it by the date-time
        session_frame = sf.filter_by(i, "session_id")
        #sorted_session = session_frame.sort("dt")
        row = session_frame[0]  # was sf[0], which always read the first row of the whole frame
        session_ids += [i]
        ip += [row['ip']]
        sub_count += [len(session_frame)]  # number of submissions in this session
        #time_count += [fn_time_count(sorted_session)]
        #error_count += [fn_error_count(sorted_session)]
        #error_sequence_raw += [fn_error_sequence_raw(sorted_session)]

    print len(session_ids)
    print len(ip)
    print len(sub_count)
    #print len(time_count)

    output_frame = output_frame.add_column(SArray(session_ids), name='id')
    output_frame.add_column(SArray(ip), name='ip')
    output_frame.add_column(SArray(sub_count), name='sub_count')
    #output_frame.add_column(SArray(time_count), name='sub_length')
    #output_frame.add_column(SArray(error_count), name='error_count')
    #output_frame.add_column(SArray(error_sequence_raw, dtype=str), name='err_seq_raw')
    output_frame.save('py2_session_analysis')
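
# The commented-out calls in process_frame assume per-session helpers
# (fn_time_count, fn_error_count, fn_error_sequence_raw) that are not in
# this file. A minimal sketch of what fn_error_count might look like,
# assuming each row of a sorted session carries a 0/1 'compile_err' flag;
# the body is an illustration, not the original implementation:
def fn_error_count(sorted_session):
    # Sum the error flags over all submissions in the session
    return sum(sorted_session['compile_err'])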
def create_frame_from_file(file_name):
    n_total_lines = 220000  # approximate line count, used only for the progress read-out
    sf = SFrame()
    with open(file_name) as data:
        dt = []
        ip = []
        py = []
        script = []
        ids = []
        for i, line in enumerate(data):
            jo = json.loads(line)
            # Each JSON field is a one-element list, so += appends its single value
            dt += jo['dt']
            ip += jo['ip']
            py += jo['py']
            ids += [i]
            script += jo['user_script']
            if i % 100 == 0:
                print float(i) / n_total_lines  # progress as a fraction of the expected total
    sf = sf.add_column(SArray(ids), name='id')
    sf.add_column(SArray(dt), name='dt')
    sf.add_column(SArray(ip), name='ip')
    sf.add_column(SArray(py, dtype=str), name='py')
    sf.add_column(SArray(script), name='user_script')
    sf.save('python_tutor')
    return sf
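
# A minimal usage sketch for create_frame_from_file, assuming the input
# is newline-delimited JSON with one-element 'dt', 'ip', 'py', and
# 'user_script' lists; the file name below is illustrative:
def demo_create_frame():
    frame = create_frame_from_file('python_tutor_log.json')
    print frame.head(5)
    # The frame is also saved to disk and can be reloaded later with
    # gl.load_sframe('python_tutor')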
def data_frame_with_target(self, data_frame):
    """
    Convert a Spark DataFrame into an SFrame and append an integer
    'target' column derived from the 'sentiment' column.

    :param data_frame: input rows with a 'sentiment' column
    :type data_frame: DataFrame
    :return: the converted frame with NA rows dropped
    :rtype: SFrame
    """
    data_sframe = SFrame(data_frame.toPandas())
    sentiment_array = data_sframe.select_column('sentiment')
    target_array = []
    for x in sentiment_array:
        try:
            target_array.append(self.convert_func(x))
        except Exception as ex:
            # Fall back to a default label (3) when conversion fails
            print len(target_array), 'get_sentiments', x
            target_array.append(3)
            print ex
    data_sframe.add_column(SArray(target_array, dtype=int), name='target')
    print data_sframe
    return data_sframe.dropna()
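
# data_frame_with_target assumes a class supplying convert_func (a
# sentiment-to-int mapper) and a Spark DataFrame input. A standalone
# sketch of the same idea, with a hypothetical mapping and a plain
# SFrame in place of the Spark DataFrame:
def demo_target_column():
    mapping = {'negative': 1, 'neutral': 3, 'positive': 5}  # illustrative values
    frame = SFrame({'sentiment': ['positive', 'negative', 'bogus']})
    # Unknown labels fall back to 3, mirroring the except branch above
    target = [mapping.get(x, 3) for x in frame['sentiment']]
    frame.add_column(SArray(target, dtype=int), name='target')
    print frame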
def create_sessions(sf=SFrame()):
    assert isinstance(sf, SFrame)
    ip = []
    user_script = []
    err_msg = []
    compile_err = []
    of_interest = []
    ignored = 0
    for i in xrange(len(sf)):
        # Skip rows whose recorded count disagrees with the number of ids
        count = sf['count'][i]
        if count != len(sf['id'][i]):
            ignored += 1
            continue
        tip = sf['ip'][i]
        # Split this IP's submissions into session chunks by date-time
        chunk_user_script = cut_dict_by_dt(sf['user_script'][i])
        user_script += chunk_user_script
        ip += [tip] * len(chunk_user_script)
        err_msg += cut_dict_by_dt(sf['err_msg'][i])
        chunk_compile_err = cut_dict_by_dt(sf['compile_err'][i])
        compile_err += chunk_compile_err
        of_interest += set_of_interest_bit(chunk_compile_err)
    print "DEBUG:", "ignored:", ignored
    rst = SFrame()
    rst.add_column(SArray(ip, dtype=str), name='ip')
    rst.add_column(SArray(user_script, dtype=dict), name='user_script')
    rst.add_column(SArray(err_msg, dtype=dict), name='err_msg')
    rst.add_column(SArray(compile_err, dtype=dict), name='compile_err')
    rst.add_column(SArray(of_interest, dtype=int), name='of_interest')
    return rst
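
# create_sessions relies on two helpers not defined in this file:
# cut_dict_by_dt, which splits one IP's submission dict into session
# chunks, and set_of_interest_bit, which flags each chunk. A sketch of a
# plausible set_of_interest_bit, assuming each chunk maps submission keys
# to 0/1 compile-error flags; only the name and call site come from the
# code above, the criterion itself is a guess:
def demo_set_of_interest_bit(compile_err_chunks):
    # Flag a chunk as interesting if any submission in it raised an error
    return [int(any(int(v) for v in chunk.itervalues()))
            for chunk in compile_err_chunks]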
def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                             new_column_name='chronic_conditions',
                                             remove_prefix=False)
    # x is a row of bene_packed in the following lambda. We insert the
    # desynpuf_id into the (key, value) tuple, convert the tuple to a list by
    # calling list(), and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(k + (x['desynpuf_id'],))
                   for k in x['chronic_conditions'].iteritems()])

    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print g.summary()

    # Take the distinct IDs of patients with chronic conditions to avoid
    # repetition in the joins below
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

    # Add edges indicating which patient was diagnosed with which condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
    for column in cols_to_drop:
        del tcdc[column]
    # The same patient can be diagnosed with the same condition multiple
    # times a year, so take distinct rows
    tcdc = tcdc.unique()
    # Keep diagnosed conditions only for patients who had some chronic
    # condition in 2008 or 2009. Such a patient may still have no
    # diagnosed condition.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)
    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc, src_field='desynpuf_id', dst_field='dgns_cd')
    print g.summary()

    # Add edges indicating which patient underwent which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    # Keep procedures only for patients who had some chronic condition in
    # 2008 or 2009. Such a patient may still have no procedure.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc, src_field='desynpuf_id', dst_field='prcdr_cd')
    print g.summary()

    # Add edges indicating which patient was prescribed which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    # Keep medicines only for patients who had some chronic condition in
    # 2008 or 2009. Such a patient may still have no medicine.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde, src_field='desynpuf_id', dst_field='substancename')
    print g.summary()
    return g
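
# A short usage sketch for the graph returned by build_data_graph: pull
# the 'had_chronic' edges back out and count conditions per patient.
# get_edges, groupby, and topk are standard GraphLab Create calls; only
# the input CSVs above are assumed:
def demo_query_graph():
    g = build_data_graph()
    chronic_edges = g.get_edges(fields={'relation': 'had_chronic'})
    per_patient = chronic_edges.groupby(
        '__src_id', {'n_conditions': gl.aggregate.COUNT()})
    print per_patient.topk('n_conditions', k=5)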
def process_frame(frame_name):
    # Columns for the new frame
    session_id = []
    ip_address = []
    python_version = []
    interest = []
    submissions = []

    # Load the frame we're processing and sort it by IP, then date-time
    frame = gl.load_sframe(frame_name)
    sorted_frame = frame.sort(['ip', 'dt'])

    # Previous IP/version tell us when we're looking at a new IP address
    previous_ip = 0
    previous_py = 0
    # Counters (for keys)
    record_counter = 1
    submission_counter = 1
    # Dictionary holding the current session's submissions
    submissions_collection = {}

    # Loop through all records, breaking them up into IP-address
    # and then 'session' chunks
    for i in xrange(len(sorted_frame)):
        if i % 100 == 0:
            print "processing record: " + str(i)
        if sorted_frame['ip'][i] != previous_ip:
            if previous_ip != 0:
                # Flush the finished session into the output columns.
                # Each value is wrapped in a one-element list so that +=
                # appends one item rather than individual characters.
                session_id += [str(record_counter)]
                ip_address += [str(previous_ip)]
                python_version += [str(previous_py)]
                interest += [str(is_interesting(submissions_collection))]
                submissions += [submissions_collection]
                # Reset all values
                submissions_collection = {}
                record_counter += 1
                submission_counter = 1
            previous_ip = sorted_frame['ip'][i]
            previous_py = sorted_frame['py'][i]
        # Create and append the submission
        d = {}
        d['date-time'] = sorted_frame['dt'][i]
        d['code_segment'] = sorted_frame['user_script'][i]
        d['error_message'] = sorted_frame['err_msg'][i]
        d['error_flag'] = sorted_frame['compile_err'][i]
        submissions_collection[str(submission_counter)] = d
        submission_counter += 1

    # Flush the last session, which the loop itself never reaches
    if previous_ip != 0:
        session_id += [str(record_counter)]
        ip_address += [str(previous_ip)]
        python_version += [str(previous_py)]
        interest += [str(is_interesting(submissions_collection))]
        submissions += [submissions_collection]

    # Finally, create the frame and save it!
    print len(session_id)
    print len(ip_address)
    print len(python_version)
    print len(submissions)
    rst = SFrame()
    rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst.add_column(SArray(python_version, dtype=str), name='python_version')
    rst.add_column(SArray(interest, dtype=str), name='interest')
    rst.add_column(SArray(submissions, dtype=dict), name='submissions')
    rst.save("test_frame")
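
# process_frame calls an is_interesting(submissions) predicate defined
# elsewhere. A plausible minimal sketch, assuming a session is
# interesting when any of its submissions raised a compile error; the
# criterion is a guess, only the name and call site come from above:
def demo_is_interesting(submissions_collection):
    return any(d['error_flag'] == 1 for d in submissions_collection.itervalues())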
def main():
    sf = SFrame()
    # Data model format
    # RecordID | Date/Time | IP Address | Python Version |
    # User Script | Compile Flag | Compile Message
    ids = []
    dt = []
    ip = []
    py = []
    script = []
    error = []
    error_msg = []
    with open('../../Data/data_file_modified.txt') as data:
        for i, line in enumerate(data):
            jo = json.loads(line)
            # Scripts for the two Python versions need to be compiled on
            # different interpreters; keep only the Python 3 records here
            if jo['py'][0] == 3:
                # Set up the data model we're using
                ids += [i]
                dt += jo['dt']
                ip += jo['ip']
                py += jo['py']
                script += jo['user_script']
                # Run the script through compile()
                # and capture any error message
                flag = False
                msg = ""
                pattern = "is local and global"
                try:
                    compile(jo['user_script'][0], '<string>', 'exec')
                except SyntaxError as e:
                    if re.search(pattern, str(e)):
                        msg = "Variable is Local and Global"
                    else:
                        msg = str(e)
                    flag = True
                if flag:
                    error += [1]
                else:
                    error += [0]
                # Chop off the error type and the (filename, line number)
                # suffix so the message carries meaning on its own
                fix_msg = msg.partition('(')[0]
                error_msg += [fix_msg.strip()]
    sf = sf.add_column(SArray(ids), name='id')
    sf.add_column(SArray(dt), name='dt')
    sf.add_column(SArray(ip), name='ip')
    sf.add_column(SArray(py, dtype=str), name='py')
    sf.add_column(SArray(script), name='user_script')
    sf.add_column(SArray(error), name='compile_err')
    sf.add_column(SArray(error_msg), name='err_msg')
    sf.save('py3_error_frame_clean')
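
# A small self-contained check of the compile-and-trim logic in main():
# compile a known-bad snippet and strip the '(file, line)' suffix from
# the SyntaxError message, exactly as the loop above does:
def demo_error_message():
    try:
        compile("def f(:\n    pass", '<string>', 'exec')
    except SyntaxError as e:
        print str(e).partition('(')[0].strip()  # e.g. "invalid syntax"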