예제 #1
0
def process_frame(filename, max_sessions=99):
    """Summarise the sessions of a saved SFrame and save the summary.

    Loads the SFrame stored at ``filename`` and, for each distinct value in
    its ``session_id`` column, records the session id, the ``ip`` value of
    the session's first row and the number of rows belonging to the session.
    The summary is saved as an SFrame named ``py2_session_analysis`` with the
    columns ``id``, ``ip`` and ``sub_count``.

    :param filename: location handed to ``gl.load_sframe``.
    :param max_sessions: upper bound on the number of sessions summarised,
        taken in ascending ``session_id`` order. The default of 99 keeps the
        cap this function previously applied; pass ``None`` to process every
        session.
    :return: None. The result is written to disk.
    """
    sf = gl.load_sframe(filename)

    # Walk the session ids that actually occur in the data rather than
    # assuming they are the integers 1..N.
    session_ids = sorted(sf['session_id'].unique())
    if max_sessions is not None:
        session_ids = session_ids[:max_sessions]

    ids = []
    ips = []
    sub_counts = []

    for session_id in session_ids:
        session_frame = sf.filter_by(session_id, "session_id")

        ids.append(session_id)
        # Read the ip from this session's own rows, not from the first row
        # of the whole frame.
        # NOTE(review): assumes every row of a session shares one ip - confirm.
        ips.append(session_frame[0]['ip'])
        # Number of rows (submissions) in the session, not the number of
        # columns in a row.
        sub_counts.append(len(session_frame))

    # TODO: per-session time, error-count and error-sequence columns are not
    # implemented yet.

    print(len(ids))
    print(len(ips))
    print(len(sub_counts))

    output_frame = SFrame()
    output_frame = output_frame.add_column(SArray(ids), name='id')
    output_frame = output_frame.add_column(SArray(ips), name='ip')
    output_frame = output_frame.add_column(SArray(sub_counts), name='sub_count')

    output_frame.save('py2_session_analysis')
예제 #2
0
def create_frame_from_file(file_name, n_total_lines=220000):
    """Build an SFrame from a file of JSON lines and save it.

    Every line of ``file_name`` is parsed with ``json.loads``. The values
    under the keys ``dt``, ``ip``, ``py`` and ``user_script`` are concatenated
    onto the matching column, and the zero-based line number becomes ``id``.
    The frame is saved under the name ``python_tutor`` and returned.

    NOTE(review): the four JSON values are concatenated (``extend``), so each
    is presumably a one-element list; any other length would make the columns
    differ in length - confirm against the data file.

    :param file_name: path of the JSON-lines input file.
    :param n_total_lines: expected number of input lines, used only to print
        a progress fraction every 100 lines. A value of 0 or ``None``
        disables the progress output.
    :return: the populated SFrame.
    """
    ids = []
    dt = []
    ip = []
    py = []
    script = []

    with open(file_name) as data:
        for i, line in enumerate(data):
            jo = json.loads(line)
            dt.extend(jo['dt'])
            ip.extend(jo['ip'])
            py.extend(jo['py'])
            ids.append(i)
            script.extend(jo['user_script'])

            # Guard against division by zero when no estimate is supplied.
            if n_total_lines and i % 100 == 0:
                print(float(i) / n_total_lines)

    # The input file is closed at this point; only in-memory lists are used.
    sf = SFrame()
    sf = sf.add_column(SArray(ids), name='id')
    sf = sf.add_column(SArray(dt), name='dt')
    sf = sf.add_column(SArray(ip), name='ip')
    sf = sf.add_column(SArray(py, dtype=str), name='py')
    sf = sf.add_column(SArray(script), name='user_script')

    sf.save('python_tutor')
    return sf
예제 #3
0
    def data_frame_with_target(self, data_frame):
        """Convert ``data_frame`` to an SFrame and append a ``target`` column.

        The input is converted through ``toPandas()``. Each value of the
        ``sentiment`` column is mapped with ``self.convert_func``; when the
        conversion raises, the failure is printed and 3 is used as the target
        for that row. Rows containing missing values are dropped from the
        returned frame.

        :param data_frame: frame to convert; must provide ``toPandas()``.
        :type data_frame: DataFrame
        :return: the converted frame with the extra integer ``target`` column.
        :rtype: SFrame
        """
        frame = SFrame(data_frame.toPandas())
        targets = []
        for sentiment in frame.select_column('sentiment'):
            try:
                target = self.convert_func(sentiment)
            except Exception as error:
                # Best effort: report the failing row and fall back to 3.
                print("%s %s %s" % (len(targets), 'get_sentiments', sentiment))
                target = 3
                print(error)
            targets.append(target)

        frame.add_column(SArray(targets, dtype=int), name='target')
        print(frame)
        return frame.dropna()
예제 #4
0
def create_sessions(sf=None):
    """Flatten per-row session data into one output row per session chunk.

    For every row of ``sf`` whose ``count`` equals the length of its ``id``
    value, the ``user_script``, ``err_msg`` and ``compile_err`` values are
    each split with ``cut_dict_by_dt``. The row's ``ip`` is repeated once per
    ``user_script`` chunk, and ``set_of_interest_bit`` is applied to the
    ``compile_err`` chunks. Rows failing the ``count`` check are skipped and
    their number is printed.

    NOTE(review): the output columns only line up if ``cut_dict_by_dt`` yields
    the same number of chunks for all three inputs of a row, and
    ``set_of_interest_bit`` yields one value per chunk - confirm against those
    helpers.

    :param sf: input SFrame with the columns ``count``, ``id``, ``ip``,
        ``user_script``, ``err_msg`` and ``compile_err``. Defaults to an empty
        SFrame created per call.
    :return: SFrame with the columns ``ip``, ``user_script``, ``err_msg``,
        ``compile_err`` and ``of_interest``.
    """
    # Build the default per call; a default of SFrame() would be constructed
    # once at definition time and shared by every call.
    if sf is None:
        sf = SFrame()
    assert isinstance(sf, SFrame)

    ip = []
    user_script = []
    err_msg = []
    compile_err = []
    of_interest = []
    ignored = 0

    for row in sf:
        if row['count'] != len(row['id']):
            ignored += 1
            continue

        chunk_user_script = cut_dict_by_dt(row['user_script'])
        user_script += chunk_user_script
        # One ip entry per chunk so the columns stay aligned.
        ip += [row['ip']] * len(chunk_user_script)

        err_msg += cut_dict_by_dt(row['err_msg'])

        chunk_compile_err = cut_dict_by_dt(row['compile_err'])
        compile_err += chunk_compile_err

        of_interest += set_of_interest_bit(chunk_compile_err)

    print("DEBUG: ignored: %s" % ignored)

    rst = SFrame()
    rst = rst.add_column(SArray(ip, dtype=str), name='ip')
    rst = rst.add_column(SArray(user_script, dtype=dict), name='user_script')
    rst = rst.add_column(SArray(err_msg, dtype=dict), name='err_msg')
    rst = rst.add_column(SArray(compile_err, dtype=dict), name='compile_err')
    rst = rst.add_column(SArray(of_interest, dtype=int), name='of_interest')

    return rst
예제 #5
0
def _load_distinct_records(csv_path, drop_columns=(), **read_csv_options):
    """Read a CSV into an SFrame, drop ``drop_columns`` and de-duplicate rows."""
    records = SFrame.read_csv(csv_path, **read_csv_options)
    for column in drop_columns:
        del records[column]
    return records.unique()


def _add_patient_edges(graph, patients, records, relation, dst_field):
    """Join ``records`` to ``patients``, tag them with ``relation``, add them
    to ``graph`` as edges, print the graph summary and return the new graph."""
    edges = patients.join(records)
    edges['relation'] = relation
    graph = graph.add_edges(edges, src_field='desynpuf_id', dst_field=dst_field)
    print(graph.summary())
    return graph


def build_data_graph(
    file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
  """Build an SGraph linking patients to conditions, procedures and drugs.

  Edges run from a patient's ``desynpuf_id`` to one of:

  * a chronic condition name (relation ``had_chronic``),
  * a diagnosis code ``dgns_cd`` (relation ``diagnosed_with``),
  * a procedure code ``prcdr_cd`` (relation ``underwent``),
  * a drug ``substancename`` (relation ``had_drug``).

  Only patients with at least one chronic condition flagged 1 are joined to
  the diagnosis, procedure and drug tables. A graph summary is printed after
  each group of edges is added.

  :param file_path: directory holding the four input CSV files. Defaults to
      the location this function has always read from.
  :return: the populated SGraph.
  """
  import os

  claim_columns_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt',
                           'claim_type', 'clm_thru_year']

  beneficiaries = SFrame.read_csv(
      os.path.join(file_path, "beneficiary_summary_2008_2009.csv"))
  bene_packed = beneficiaries.pack_columns(column_prefix='chron_',
                                           dtype=dict,
                                           new_column_name='chronic_conditions',
                                           remove_prefix=False)

  # x is a row of bene_packed. Emit one [name, value, desynpuf_id] list per
  # entry of the packed chronic_conditions dict, i.e. a list of lists.
  bene_chrons = bene_packed.flat_map(
      ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
      lambda x: [[name, value, x['desynpuf_id']]
                 for name, value in x['chronic_conditions'].items()])

  # Keep only the conditions flagged with the value 1.
  bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
  del bene_chrons['chronic_condition_value']
  bene_chrons = bene_chrons.rename(
      {'chronic_condition_name': 'chronic_condition'})

  g = SGraph()
  bene_chrons['relation'] = 'had_chronic'
  g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                  dst_field='chronic_condition')
  print(g.summary())

  # Distinct IDs of patients with chronic conditions, to avoid repetition in
  # the joins below.
  bene_with_chrons = SFrame(
      {'desynpuf_id': bene_chrons['desynpuf_id'].unique()})

  # Which patient had which diagnosed condition. The same patient can be
  # diagnosed with the same condition several times a year, hence distinct.
  tcdc = _load_distinct_records(
      os.path.join(file_path, "transformed_claim_diagnosis_codes.csv"),
      claim_columns_to_drop)
  g = _add_patient_edges(g, bene_with_chrons, tcdc,
                         'diagnosed_with', 'dgns_cd')

  # Which patient had which procedure; procedure codes are read as strings.
  tcpc = _load_distinct_records(
      os.path.join(file_path, "transformed_claim_prcdr_codes.csv"),
      claim_columns_to_drop,
      column_type_hints={'prcdr_cd': str})
  g = _add_patient_edges(g, bene_with_chrons, tcpc, 'underwent', 'prcdr_cd')

  # Which patient had which medicine.
  pde = _load_distinct_records(os.path.join(file_path, "prescribed_drugs.csv"))
  g = _add_patient_edges(g, bene_with_chrons, pde, 'had_drug', 'substancename')

  return g
예제 #6
0
def process_frame(frame_name):
    """Group the records of a saved SFrame into per-IP sessions and save them.

    The frame at ``frame_name`` is loaded and sorted by ``ip`` and then
    ``dt``. Each run of consecutive rows sharing an ``ip`` becomes one
    session. A session stores the ip, the ``py`` value of its first row and a
    dict of submissions keyed ``'1'``, ``'2'``, ... in sorted order. Each
    submission is a dict with the keys ``date-time``, ``code_segment``,
    ``error_message`` and ``error_flag``. The sessions are saved as an SFrame
    named ``test_frame`` with the columns ``session_id``, ``ip_address``,
    ``python_version`` and ``submissions``.

    Every record is processed, and the final session is written out as well.

    :param frame_name: location handed to ``gl.load_sframe``.
    :return: None. The result is written to disk.
    """
    frame = gl.load_sframe(frame_name)
    sorted_frame = frame.sort(['ip', 'dt'])

    # Output columns, one entry per finished session.
    session_id = []
    ip_address = []
    python_version = []
    interest = []
    submissions = []

    def close_session(number, ip, py, collected):
        # Append whole values; ``list += str(...)`` would add one entry per
        # character and ``list += dict`` would add the dict's keys.
        session_id.append(str(number))
        ip_address.append(str(ip))
        python_version.append(str(py))
        # NOTE(review): the interest flag is collected but not saved, which
        # matches the existing output schema - confirm whether an 'interest'
        # column is wanted.
        interest.append(str(is_interesting(collected)))
        submissions.append(collected)

    record_counter = 0
    previous_ip = None
    previous_py = None
    # None until the first row has been seen; afterwards the open session.
    submissions_collection = None

    for i, row in enumerate(sorted_frame):
        if i % 100 == 0:
            print("processing record:" + str(i))

        if submissions_collection is None or row['ip'] != previous_ip:
            if submissions_collection is not None:
                record_counter += 1
                close_session(record_counter, previous_ip, previous_py,
                              submissions_collection)
            # Start a new session on the first row and on every ip change.
            submissions_collection = {}
            previous_ip = row['ip']
            previous_py = row['py']

        submission_key = str(len(submissions_collection) + 1)
        submissions_collection[submission_key] = {
            'date-time': row['dt'],
            'code_segment': row['user_script'],
            'error_message': row['err_msg'],
            'error_flag': row['compile_err'],
        }

    # Flush the session still open when the input ends.
    if submissions_collection is not None:
        record_counter += 1
        close_session(record_counter, previous_ip, previous_py,
                      submissions_collection)

    print(ip_address)
    print(len(session_id))
    print(len(ip_address))
    print(len(python_version))
    print(len(submissions))

    rst = SFrame()
    rst = rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst = rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst = rst.add_column(SArray(python_version, dtype=str),
                         name='python_version')
    rst = rst.add_column(SArray(submissions, dtype=dict), name='submissions')

    rst.save("test_frame")
예제 #7
0
def build_data_graph(
        file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
    """Build an SGraph linking patients to conditions, procedures and drugs.

    Edges run from a patient's ``desynpuf_id`` to a chronic condition name
    (relation ``had_chronic``), a diagnosis code ``dgns_cd``
    (``diagnosed_with``), a procedure code ``prcdr_cd`` (``underwent``) or a
    drug ``substancename`` (``had_drug``). Only patients with at least one
    chronic condition flagged 1 are joined to the diagnosis, procedure and
    drug tables. A graph summary is printed after each group of edges.

    :param file_path: directory holding the four input CSV files. Defaults to
        the location this function has always read from.
    :return: the populated SGraph.
    """
    import os

    def read_csv(file_name, **read_csv_options):
        return SFrame.read_csv(os.path.join(file_path, file_name),
                               **read_csv_options)

    beneficiaries = read_csv("beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    # x is a row of bene_packed. Emit one [name, value, desynpuf_id] list per
    # entry of the packed chronic_conditions dict, i.e. a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [
            [name, value, x['desynpuf_id']]
            for name, value in x['chronic_conditions'].items()
        ])

    # Keep only the conditions flagged with the value 1.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons = bene_chrons.rename(
        {'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons,
                    src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print(g.summary())

    # Distinct IDs of patients with chronic conditions, to avoid repetition
    # in the joins below.
    bene_with_chrons = SFrame(
        {'desynpuf_id': bene_chrons['desynpuf_id'].unique()})

    claim_columns_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]

    # (file name, read_csv options, columns to drop, relation, destination
    # field), processed in this order. Rows are de-duplicated because the same
    # patient can have the same record several times a year.
    relation_sources = [
        ("transformed_claim_diagnosis_codes.csv", {},
         claim_columns_to_drop, 'diagnosed_with', 'dgns_cd'),
        ("transformed_claim_prcdr_codes.csv",
         {'column_type_hints': {'prcdr_cd': str}},
         claim_columns_to_drop, 'underwent', 'prcdr_cd'),
        ("prescribed_drugs.csv", {}, [], 'had_drug', 'substancename'),
    ]

    for (file_name, read_csv_options, drop_columns, relation,
         dst_field) in relation_sources:
        records = read_csv(file_name, **read_csv_options)
        for column in drop_columns:
            del records[column]
        records = records.unique()

        # Restrict to patients who had some chronic condition; such a patient
        # may have no matching record at all.
        edges = bene_with_chrons.join(records)
        edges['relation'] = relation
        g = g.add_edges(edges, src_field='desynpuf_id', dst_field=dst_field)
        print(g.summary())

    return g
예제 #8
0
def main():
    """Compile-check the Python 3 scripts in the data file and save the results.

    Reads ``../../Data/data_file_modified.txt`` as JSON lines. For every
    record whose ``py`` value starts with 3, the first ``user_script`` entry
    is passed to ``compile``. Whether a ``SyntaxError`` occurred and a
    trimmed error message are stored next to the record's fields. The
    resulting SFrame is saved as ``py3_error_frame_clean``.
    """
    local_and_global = "is local and global"

    with open('../../Data/data_file_modified.txt') as data:
        sf = SFrame()

        # Data model: record id | date/time | ip address | python version |
        # user script | compile flag | compile message
        record_ids = []
        timestamps = []
        addresses = []
        versions = []
        sources = []
        compile_flags = []
        compile_messages = []

        for line_number, raw_line in enumerate(data):
            record = json.loads(raw_line)

            # Only scripts written for Python 3 are handled here; the other
            # version has to be compiled on a different interpreter.
            if record['py'][0] != 3:
                continue

            record_ids.append(line_number)
            timestamps.extend(record['dt'])
            addresses.extend(record['ip'])
            versions.extend(record['py'])
            sources.extend(record['user_script'])

            try:
                compile(record['user_script'][0], '<string>', 'exec')
            except SyntaxError as error:
                failed = 1
                text = str(error)
                if re.search(local_and_global, text):
                    message = "Variable is Local and Global"
                else:
                    message = text
            else:
                failed = 0
                message = ""

            compile_flags.append(failed)

            # Keep only the text before the first '(' so the trailing
            # "(filename, line number)" part is removed.
            compile_messages.append(message.partition('(')[0].strip())

        sf = sf.add_column(SArray(record_ids), name='id')
        sf.add_column(SArray(timestamps), name='dt')
        sf.add_column(SArray(addresses), name='ip')
        sf.add_column(SArray(versions, dtype=str), name='py')
        sf.add_column(SArray(sources), name='user_script')
        sf.add_column(SArray(compile_flags), name='compile_err')
        sf.add_column(SArray(compile_messages), name='err_msg')

        sf.save('py3_error_frame_clean')