Пример #1
0
    def significant_nei_utm_ids():
        mf_utm_id_to_valid_nei_utm_ids = {}
        def get_utm_vectors():
            so_hashtags = set()
            for utm_object in \
                    FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
                for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                    if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
                mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                                utm_object['mf_nei_utm_id_to_common_h_count'].keys()
            hashtags, ltuo_utm_id_and_vector = sorted(list(so_hashtags)), []
            for i, utm_object in enumerate(FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True)):
#                print i, utm_object['utm_id']
                utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                     hashtags)
                ltuo_utm_id_and_vector.append((utm_object['utm_id'], 
                                               robjects.FloatVector(utm_id_vector)))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            return df_utm_vectors
        output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()
        df_utm_vectors = get_utm_vectors()
#        print df_utm_vectors.nrow
#        exit()
        utm_colnames = df_utm_vectors.colnames
        mf_utm_id_to_utm_colnames = dict(zip(sorted(mf_utm_id_to_valid_nei_utm_ids), utm_colnames))
        mf_utm_colnames_to_utm_id = dict(zip(utm_colnames, sorted(mf_utm_id_to_valid_nei_utm_ids)))
        for i, utm_colname in enumerate(utm_colnames):
            utm_id = mf_utm_colnames_to_utm_id[utm_colname]
            prediction_variable = utm_colname
            print i, utm_id
            predictor_variables = [mf_utm_id_to_utm_colnames[valid_nei_utm_ids]
                                    for valid_nei_utm_ids in mf_utm_id_to_valid_nei_utm_ids[utm_id]
                                        if valid_nei_utm_ids in mf_utm_id_to_utm_colnames and
                                           valid_nei_utm_ids != utm_id ]
            selected_utm_colnames =  R_Helper.variable_selection_using_backward_elimination(
                                                                                               df_utm_vectors,
                                                                                               prediction_variable,
                                                                                               predictor_variables,
                                                                                               debug=True
                                                                                            )
            nei_utm_ids = [mf_utm_colnames_to_utm_id[selected_utm_colname]
                                for selected_utm_colname in selected_utm_colnames]
            print 'Writing to: ', output_file
            FileIO.writeToFileAsJson({'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}, output_file)
Пример #2
0
 def mapper(self, key, line):
     """Run backward elimination for one JSON-encoded input line.

     ``line`` is decoded with ``cjson``. Three bookkeeping entries are
     removed from the decoded object and the remainder is re-encoded and
     turned into a data frame by ``R_Helper.get_data_frame_from_json``.
     ``key`` is not used.

     Yields a single ``(utm_id, {'utm_id': ..., 'nei_utm_ids': ...})`` pair,
     where ``utm_id`` belongs to the prediction variable and ``nei_utm_ids``
     are the ids of the column names returned by the selection.
     """
     payload = cjson.decode(line)
     # Strip the bookkeeping entries so only the remaining keys reach the
     # data frame.
     target_colname = payload.pop('prediction_variable')
     candidate_colnames = payload.pop('predictor_variables')
     utm_id_by_colname = payload.pop('mf_utm_colnames_to_utm_ids')
     utm_frame = R_Helper.get_data_frame_from_json(cjson.encode(payload))
     kept_colnames = R_Helper.variable_selection_using_backward_elimination(
         utm_frame,
         target_colname,
         candidate_colnames
     )
     utm_id = utm_id_by_colname[target_colname]
     result = {
         'utm_id': utm_id,
         'nei_utm_ids': [utm_id_by_colname[kept_colname]
                         for kept_colname in kept_colnames],
     }
     yield utm_id, result