Example No. 1
    def arffwriteToFile(self, X, Y, theFile):
        #TODO: Fix warning
        import arff

        arffFeatObj = {
            'description': 'infodens feats',
            'relation': 'translationese'
        }
        dims = X.get_shape()
        attrib = []

        # list of attributes
        for i in range(dims[1]):
            attribTuple = (str(i), "REAL")
            attrib.append(attribTuple)

        arffClasses = list(map(str, set(Y)))
        attrib.append(("y", arffClasses))

        Y = sparse.coo_matrix(Y).transpose()
        data = sparse.hstack([X, Y], "lil")

        arffFeatObj['attributes'] = attrib
        arffFeatObj['data'] = data.tocoo()

        thefile = open(theFile, 'w')
        arff.dump(arffFeatObj, thefile)
        thefile.close()
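The dict-first dump(obj, file_handle) call in Example 1 matches the liac-arff API. A minimal standalone sketch of that layout, with made-up attributes and rows rather than the sparse features above:

# Minimal liac-arff sketch (hypothetical data): build the object dict, then write it.
import arff  # liac-arff

obj = {
    'description': 'toy dataset',
    'relation': 'toy',
    'attributes': [('f0', 'REAL'), ('f1', 'REAL'), ('y', ['0', '1'])],
    'data': [[0.1, 2.3, '0'], [4.5, 6.7, '1']],
}

with open('toy.arff', 'w') as fh:
    arff.dump(obj, fh)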
Example No. 2
 def produce_features_weka(self):
     root, _ = os.path.splitext(self.args.config)
     arff_f = '.'.join([root, 'arff'])
     relation = os.path.basename(root)
     names = [ 'site', 'code' ]
     names.extend(imap(str, self.indicators))
     arff.dump(arff_f, self._iter_rows(), relation=root, names=names)
Example No. 3
def main(instances):
    # data = [[1,2,'a'], [3, 4, 'john']]
    data = []
    attrs = 7
    for i in range(instances):
        attr = []
        for j in range(attrs):
            a = random.randint(0, 1)
            attr.append(a)
        prob = int(random.random() * 10)
        c = (attr[0] and attr[2]) or int((not attr[2] and not attr[3]) or (attr[6] and attr[4]))
        # 	if prob > 6: #attr.append(int(not c))
        # 	attr[3] = int(not(attr[3]))
        # 	attr[6] = int(not(attr[6])) #rob = 0.81
        # 	attr[4] = int(not(attr[4]))
        # 	attr[2] = int(not(attr[2]))
        # 	attr[0] = int(not(attr[0]))
        attr.append(c)
        data.append(attr)
    names = []
    for i in range(attrs):
        names.append("a" + str(i))
    names.append("result")

    arff.dump("result.arff", data, relation="boolean", names=names)
Example No. 4
 def map_columns_and_write_to_file(self):
     self.df = self.df[
         self.df['repo_name'].isin(self.top_ten_repos_by_count)
     ]
     self.df = self.df.replace({'repo_name': self.repo_id_to_name_map})
     self.df.drop(columns=['issue_id'], inplace=True, axis=1)
     self.grouped = self.df.groupby('repo_name')
     print("Beginning arff export")
     for name, group in self.grouped:
         current = group.reset_index()
         current.drop(
             columns=['index', 'repo_name'],
             inplace=True,
             axis=1
         )
         arff.dump(
             f'randomRepos/{name}.arff',
             current.values,
             relation=name,
             names=current.columns
         )
         print(f"{name}.arff completed!")
     # arff.dump(
     #     f'riivo.arff',
     #     self.df.values,
     #     relation='riivo',
     #     names=self.df.columns
     # )
     print("Finished arff export!")
Example No. 5
    def predict(self, articles):
        # modifies the provided articles dict

        data = {
            'attributes': [('title', 'STRING'), ('body', 'STRING'),
                           ('class', ['yes', 'no'])],
            'data': [],
            'description':
            u'',
            'relation':
            '0'
        }

        for urlid in sorted(articles.keys()):
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
        arff.dump(fnew, data)
        fnew.close()

        predictions = self.__predict_arff()

        for urlid in sorted(articles.keys()):
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            for (i, urlid) in enumerate(sorted(articles.keys())):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
Example No. 6
    def save_to_arff(self, file: Path, name=None) -> None:
        """Save all the events in all traces into an ARFF file for machine learning.

        Args:
            file: the path of the file to save into.  Should end with '.arff'.
            name: optional relation name to identify this data inside the ARFF file.
                The default is the base name of 'file'.
        """
        if isinstance(file, str):
            print(
                f"WARNING: converting {file} to Path.  Please learn to speak pathlib."
            )
            file = Path(file)
        if name is None:
            name = file.stem
        data = self.to_pandas()
        attributes = [(n, self.arff_type(t))
                      for (n, t) in zip(data.columns, data.dtypes)]
        try:
            import arff
        except ImportError:
            print("Please install ARFF support before using save_to_arff.")
            print("It is a pip only package:  pip install liac-arff")
            return
        with file.open("w") as output:
            contents = {
                "relation": safe_name(name),
                "attributes": attributes,
                "data": data.values,  # [[tr] for tr in trace_summaries],
                "description": "Events from " + name
            }
            arff.dump(contents, output)
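The self.arff_type helper used above is not shown. A plausible stand-in that maps pandas/NumPy dtypes onto ARFF type strings could look like the sketch below; the mapping choices are assumptions, not the original implementation.

# Hypothetical dtype-to-ARFF-type mapping, standing in for self.arff_type.
import numpy as np

def arff_type(dtype):
    if np.issubdtype(dtype, np.integer):
        return 'INTEGER'
    if np.issubdtype(dtype, np.floating):
        return 'REAL'
    # fall back to STRING for object, boolean, and other dtypes
    return 'STRING'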
Example No. 7
def save_data_arff(dataset, dataset_filename, arff_data=None):
    """Saves the dataset, arff-formatted, to dataset_filename.
    if arff_data is provided, dataset is used for arff_data['data']."""

    if arff_data is not None:
        arff_data['data'] = dataset.values.tolist()

    else:
        real_attrs = [(name, 'REAL') for name in dataset.select_dtypes(
            include='floating').columns.values]
        integer_attrs = [
            (name, 'INTEGER')
            for name in dataset.select_dtypes(include='integer').columns.values
        ]
        nominal_attrs = [
            (name, list(dataset[name].unique()))
            for name in dataset.select_dtypes(include='object').columns.values
        ]
        # re-arrange columns like above
        dataset = dataset[[
            name for (name, type) in real_attrs + integer_attrs + nominal_attrs
        ]]
        dataset_name = os.path.splitext(os.path.basename(dataset_filename))[0]
        arff_data = {
            'relation': dataset_name,
            'attributes': real_attrs + integer_attrs + nominal_attrs,
            'data': dataset.values.tolist()
        }

    arff.dump(arff_data, open(dataset_filename, 'w'))
    print("Dataset saved as {}".format(dataset_filename))
Example No. 8
def dump_to_arff(df, relation_name, description, output_file):
    
    attributes = []
    for col_n in df.columns:
        
        if df[col_n].dtypes == 'object':
            # sort nominal attributes
            nominal_att_list = df[col_n].unique().tolist()
            list.sort(nominal_att_list)
            # remove missing value mark from the attribute list
            if '?' in nominal_att_list:
                nominal_att_list.remove('?')
            attributes.append((col_n, nominal_att_list))
        else:
            attributes.append((col_n, 'NUMERIC'))

    arff_dic = {
        'attributes': attributes,
        'data': df.values,
        'relation': relation_name,
        'description': description
    }

    with open(output_file, 'w', encoding="utf8") as f:
        arff.dump(arff_dic, f)
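A small usage sketch of dump_to_arff with an invented DataFrame; the column names and values are illustrative only.

# Hypothetical usage of dump_to_arff defined above.
import pandas as pd

df = pd.DataFrame({
    'age': [23, 31, 47],                # numeric column -> NUMERIC attribute
    'city': ['paris', 'rome', 'rome'],  # object column -> nominal attribute
})
dump_to_arff(df, relation_name='people', description='toy data', output_file='people.arff')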
Example No. 9
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    config_space = sklearnbot.config_spaces.get_config_space(
        args.classifier_name, args.random_seed)

    meta_data = openmldefaults.utils.get_dataset_metadata(args.metadata_file)
    if args.scoring not in meta_data['measure']:
        raise ValueError('Could not find measure: %s' % args.scoring)
    metadata_frame = openmldefaults.utils.metadata_file_to_frame(
        args.metadata_file, config_space, args.scoring)

    df_surrogate = openmldefaults.utils.generate_grid_dataset(
        metadata_frame, config_space, args.resized_grid_size, args.scoring,
        args.random_seed)
    # if df_surrogate.shape[1] < num_params + len(study.tasks) / 2:
    #    raise ValueError('surrogate frame has too few columns. Min: %d Got %d' % (num_params + len(study.tasks) / 2,
    #                                                                              df_surrogate.shape[1]))
    os.makedirs(args.output_directory, exist_ok=True)
    df_surrogate.reset_index(inplace=True)
    arff_object = openmlcontrib.meta.dataframe_to_arff(
        df_surrogate, 'surrogate_%s' % args.classifier_name,
        json.dumps(meta_data))
    filename = os.path.join(
        args.output_directory, 'surrogate__%s__%s__c%d.arff' %
        (args.classifier_name, args.scoring, args.resized_grid_size))
    with open(filename, 'w') as fp:
        arff.dump(arff_object, fp)
    logging.info('Saved to: %s' % filename)
Example No. 10
	def convertToWeka(self, fileToConvert, classNominalValues, loanGradeNominalValues, numericAttributesNames, nominalAttributesNames, wekaFile):
		""" Generate Weka ARFF file from API downloader data"""

		converter = Converter()
		apiDataConverted = converter.convertDataFromFile(fileToConvert)

		data = self.prepareWekaData(apiDataConverted, numericAttributesNames, nominalAttributesNames)

		dataset = {}

		dataset['attributes'] = []
		
		for name in numericAttributesNames:
			attribute = (name, 'REAL')
			dataset['attributes'].append(attribute)


		loanGradeAttribute = ('loanGrade', loanGradeNominalValues)
		dataset['attributes'].append(loanGradeAttribute)

		classAttribute = ('noteStatus', classNominalValues)
		dataset['attributes'].append(classAttribute)
		
		
		dataset['data'] = data
		dataset['description'] =  u''
		dataset['relation'] = 'downloader data'

		arff.dump(open(wekaFile, 'w'), dataset)
Example No. 11
	def createSampleArff(self):
		"""Sample Weka ARFF file generation"""

		data = {
				'attributes': [
					('outlook', ['sunny', 'overcast', 'rainy']),
					('temperature', 'REAL'),
					('humidity', 'REAL'),
					('windy', ['TRUE', 'FALSE']),
					('play', ['yes', 'no'])],
				'data': [
					['sunny', 85.0, 85.0, None, 'no'],
					['sunny', 80.0, 90.0, 'TRUE', 'no'],
					['overcast', 83.0, 86.0, 'FALSE', 'yes'],
					['rainy', 70.0, 96.0, 'FALSE', 'yes'],
					['rainy', 68.0, 80.0, 'FALSE', 'yes'],
					['rainy', 65.0, 70.0, 'TRUE', 'no'],
					['overcast', 64.0, 65.0, 'TRUE', 'yes'],
					['sunny', 72.0, 95.0, 'FALSE', 'no'],
					['sunny', 69.0, 70.0, 'FALSE', 'yes'],
					['rainy', 75.0, 80.0, 'FALSE', 'yes'],
					['sunny', 75.0, 70.0, 'TRUE', 'yes'],
					['overcast', 72.0, 90.0, 'TRUE', 'yes'],
					['overcast', 81.0, 75.0, 'FALSE', 'yes'],
					['rainy', 71.0, 91.0, 'TRUE', 'no']],
				'description': u'',
				'relation': 'weather'
				}


		wekaFile = "../data/test.arff"
		arff.dump(open(wekaFile, 'w'), data)
Example No. 12
def replacingMissingValues():
    myArff = arff.load(open('competition-iaa-2018-2019/train.arff', 'r'))
    data = np.array(myArff['data'])

    positions1 = getPositionMissingOnes(0)
    data1 = getMissingDataAttr1()

    j = 0
    for i in positions1:
        data[i][8] = data1[j]
        j += 1

    positions2 = getPositionMissingOnes(1)
    data2 = getMissingDataAttr2()

    j = 0
    for i in positions2:
        data[i][9] = data2[j]
        j += 1

    positions3 = getPositionMissingOnes(2)
    data3 = getMissingDataAttr3()

    j = 0
    for i in positions3:
        data[i][10] = data3[j]
        j += 1

    myArff['data'] = data

    f = open('pruea.arff', 'w')
    arff.dump(myArff, f)
Example No. 13
    def arff_to_big_endian(cls, filename, dataset, n_labels):

        data = Dataset.load_arff(filename,
                                 n_labels,
                                 endian="little",
                                 input_feature_type='float',
                                 encode_nominal=True)
        new_data = np.concatenate((data['Y'], data['X']), axis=1)

        arff_frame = arff.load(open(filename, 'r'),
                               encode_nominal=True,
                               return_type=arff.DENSE)

        arff_frame['data'] = new_data.tolist()
        # make the labels nominal
        for i in range(data['Y'].shape[0]):
            for j in range(data['Y'].shape[1]):
                arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

        arff_frame['attributes'] = arff_frame['attributes'][
            -n_labels:] + arff_frame['attributes'][:-n_labels]

        # nominal attributes to int format
        attributes = arff_frame['attributes']
        for j in range(data['Y'].shape[1],
                       data['X'].shape[1] + data['Y'].shape[1]):
            if isinstance(attributes[j][1], list):
                for i in range(data['Y'].shape[0]):
                    arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

        arff_frame['relation'] = dataset + "_mlcsn: -C " + str(n_labels)
        f = open(filename, "w")
        arff.dump(arff_frame, f)
        f.close()
Example No. 14
def spit_datasets(filename=''):
    path1="../../data/UCI/" + filename + ".csv"
    df=pd.read_csv(path1,header=None)
    df[df.columns[-1]]=df[df.columns[-1]].apply(lambda x: True if x==1 else False)
    count=1
    for i in range(4):
        df = df.sample(frac=1).reset_index(drop=True)
        #dict, labels = df[df.columns[:-1]], df[df.columns[-1]]
        #skf = StratifiedKFold(n_splits=5, shuffle=False)
        arff.dump(data_path + "/CHIRP/Train/" + filename + str(count) + ".arff"
                  , df.values, relation='name', names=df.columns)
        # for train_index, test_index in skf.split(dict, labels):
        #     X_train, X_test = dict[dict.index.isin(train_index.tolist())], dict[dict.index.isin(test_index.tolist())]
        #     y_train, y_test = labels[labels.index.isin(train_index.tolist())], labels[labels.index.isin(test_index.tolist())]
        #     X_train["class"]=y_train
        #     X_test["class"]=y_test
        #     df = pd.concat([X_train, X_test], ignore_index=True)

            # arff.dump(data_path+"/CHIRP/Train/"+filename+str(count)+".arff"
            #           , df.values, relation='name', names=df.columns)
            # arff.dump(data_path + "/CHIRP/Test/" + filename + str(count) + ".arff"
            #           , X_test.values, relation='name', names=X_test.columns)

            #path2=data_path+"/CHIRP/Train/"+filename+str(count)+".csv"
            #path3=data_path+"/CHIRP/Test/"+filename+str(count)+".csv"
            #X_train.to_csv(path2,index=False)
            #X_test.to_csv(path3, index=False)
        count+=1
Example No. 15
def save_features_to_arff(all_features, output_file):

    dataset = {}
    dataset['description'] = 'Android Apps Dataset'
    dataset['relation'] = 'Android Apps Features for IR detection'
    dataset['attributes'] = [
        ('Avg_Wordsize_Flds', 'REAL'),
        ('Avg_Distances_Flds', 'REAL'),
        ('Num_Flds_L1', 'REAL'),
        ('Num_Flds_L2', 'REAL'),
        ('Num_Flds_L3', 'REAL'),
        ('Avg_Wordsize_Mtds', 'REAL'),
        ('Avg_Distances_Mtds', 'REAL'),
        ('Num_Mtds_L1', 'REAL'),
        ('Num_Mtds_L2', 'REAL'),
        ('Num_Mtds_L3', 'REAL'),
        ('Avg_Wordsize_Cls', 'REAL'),
        ('Avg_Distances_Cls', 'REAL'),
        ('Num_Cls_L1', 'REAL'),
        ('Num_Cls_L2', 'REAL'),
        ('Num_Cls_L3', 'REAL'),
        ('class', 'REAL')]

    dataset['data'] = []
    if all_features != []:
        for item in all_features:
            dataset['data'].append(item)
    
    if dataset['data'] != []:
        arff.dump(dataset, output_file)
Example No. 16
    def arff_to_big_endian(cls, filename, dataset, n_labels):

        data = Dataset.load_arff(filename, n_labels, endian = "little", input_feature_type = 'float', encode_nominal = True)
        new_data = np.concatenate((data['Y'],data['X']), axis=1)

        arff_frame = arff.load(open(filename,'r'), encode_nominal = True, return_type=arff.DENSE)

        arff_frame['data'] = new_data.tolist()
        # make the labels nominal
        for i in range(data['Y'].shape[0]):
            for j in range(data['Y'].shape[1]):
                arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

        arff_frame['attributes'] = arff_frame['attributes'][-n_labels:] + arff_frame['attributes'][:-n_labels]

        # nominal attributes to int format
        attributes = arff_frame['attributes']
        for j in range(data['Y'].shape[1], data['X'].shape[1] + data['Y'].shape[1]):
            if isinstance(attributes[j][1], list):
                for i in range(data['Y'].shape[0]):
                    arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

                    

        arff_frame['relation'] = dataset + "_mlcsn: -C " + str(n_labels)
        f = open(filename,"w")
        arff.dump(arff_frame, f)
        f.close()
Example No. 17
def _save_split_set(path, name, full_dataset=None, rows=None, cols=None):
    # X_split = X[indexes, :]
    # y_split = y.reshape(-1, 1)[indexes, :]
    log.debug("Saving %s split dataset to %s.", name, path)
    if rows is None:
        rows = slice(None)
    else:
        assert isinstance(rows, list)
        rows = np.array(rows)
    full_attributes = full_dataset['attributes']
    if cols is None:
        cols = slice(None)
        attributes = full_attributes
    else:
        assert isinstance(cols, list)
        cols = np.array(cols)
        attributes = [full_attributes[i] for i in cols]
    if len(attributes) != len(full_attributes):
        log.debug("Keeping only attributes %s", [a for a, _ in attributes])
    with open(path, 'w') as file:
        description = '\n'.join([
            "Split dataset file generated by automlbenchmark.", "",
            full_dataset['description']
        ])
        split_data = np.asarray(full_dataset['data'],
                                dtype=object)[rows[:, None], cols]
        arff.dump(
            {
                'description': description,
                'relation': name,
                'attributes': attributes,
                'data': split_data
            }, file)
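The row/column selection above relies on NumPy advanced indexing: indexing with a column vector of row indices and a 1-D array of column indices selects the full rows-by-columns sub-array. A standalone sketch of the same idiom with toy data:

# NumPy advanced-indexing sketch: pick a sub-array by row and column index lists.
import numpy as np

data = np.arange(12).reshape(3, 4)
rows = np.array([0, 2])
cols = np.array([1, 3])
sub = data[rows[:, None], cols]  # shape (2, 2): rows 0 and 2, columns 1 and 3
print(sub)                       # [[ 1  3] [ 9 11]]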
Example No. 18
def create_filtered_dataset(file_name, filtered_attacks):
    file = arff.load(open("Datasets/" + file_name + ".arff"))
    original_data = file['data']
    attributes = file['attributes']
    attack_types = get_attack_column(file_name)
    new_data = remove_attacks(
        original_data,
        get_filter_indices(filtered_attacks, "Datasets/" + file_name + ".txt"))
    new_attack_types = remove_attacks(
        attack_types,
        get_filter_indices(filtered_attacks, "Datasets/" + file_name + ".txt"))
    return_arff = {
        'relation': 'KDDFiltered',
        'description': '',
        'data': new_data,
        'attributes': attributes
    }
    arff.dump(return_arff,
              open("Datasets/" + file_name + "_filtered.arff", "w+"))

    file = open("Datasets/" + file_name + "_filtered_attacks", "w+")
    for attack in new_attack_types:
        file.write(attack + "\n")

    file.close()
Example No. 19
def get_size_and_dexsize(path_to_predictions, path_to_arff):
    # read predictions
    f = open(path_to_predictions, 'r')
    content = f.readlines()
    f.close()

    error_index = []
    for line in content:
        if '+' in line:
            error_index.append(int(line.split()[0]))

    # generate error list
    f = open(path_to_arff, 'r')
    file = f.read()
    f.close()

    d = arff.loads(file)

    # output ARFF object for the misclassified instances; assumed to reuse the
    # loaded file's relation/description/attributes, with a fresh data list
    obj = {'relation': d['relation'],
           'description': d['description'],
           'attributes': d['attributes'],
           'data': []}
    error_list = []
    for index in error_index:
        obj['data'].append(d['data'][index])
        error_list.append({
            'size': d['data'][index][0],
            'dex_size': d['data'][index][1]
        })

    # write error vectors to arff
    f = open('incorrectly_classified.arff', 'w')
    arff.dump(obj, f)
    f.close()

    return error_list
Example No. 20
def getArchivesIDListFromARFF():
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    f = codecs.open('article_merged_2.arff', 'r', encoding='utf8')
    span = 0
    ArticleList = []
    for index, l in enumerate(f):
        if (l[0] == '@'):
            span = span + 1
            continue
        elif (index >= start + span and index < end + span):
            l = l.split(',')
            ID = l[0].replace('"', '')
            a = getArticle(ID)
            if (a == False):
                continue
            ArticleList.append(a.toList())
        else:
            continue
    arff.dump('article_v2_' + str(start) + '_' + str(end) + '.arff',
              ArticleList,
              relation="article",
              names=[
                  'ArchivesID', 'Category', 'Department', 'ReadCount', 'Title',
                  'Content', 'Glossary'
              ])
Example No. 21
    def write_arff(self, arff_file, relation='multi_target'):
        # get string representation of numeric labels (class index or regression target)
        def encode_labels(labels, label_type):
            if str(label_type).upper() == 'NUMERIC':
                return labels
            elif isinstance(label_type, list):  # nominal
                labels_numeric = np.copy(labels)  # copy
                labels_numeric[labels == None] = -1
                label_type_with_missing = label_type + ['?']
                # note: can't index by object type array
                return np.take(label_type_with_missing,
                               labels_numeric.astype(np.int64))
            else:
                raise ValueError("label_type = '%s' not allowed" % label_type)

        fh = open(arff_file, "w")
        arff_data = {
          'data': np.hstack((np.expand_dims(self.instance_names, axis=-1),
                             self.features,
                             np.hstack([np.expand_dims(encode_labels(self.labels[idx], self.target_types[idx]), axis=-1) \
                               for idx in range(len(self.labels))])
                           )),
          'attributes': [('instance_name', 'STRING')] \
            + list(zip(self.feature_names, self.feature_types)) \
            + list(zip(self.target_names, self.target_types)),
          'relation': relation,
          'description': 'multi-target dataset generated by openXData 0.1'
        }
        arff.dump(arff_data, fh)
        fh.close()
Example No. 22
    def predict(self, articles):
        # modifies the provided articles dict

        data = {'attributes': [('title', 'STRING'),
                               ('body', 'STRING'),
                               ('class', ['yes', 'no'])],
                'data': [], 'description': u'', 'relation': '0'}

        for urlid in sorted(articles.keys()):
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
        arff.dump(fnew, data)
        fnew.close()

        predictions = self.__predict_arff()

        for urlid in sorted(articles.keys()):
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            for (i, urlid) in enumerate(sorted(articles.keys())):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
Example No. 23
    def write_arff(self, filename):
        instrument_arff = copy.deepcopy(arff_format)
        note_arff = copy.deepcopy(arff_format)
        velocity_arff = copy.deepcopy(arff_format)
        duration_arff = copy.deepcopy(arff_format)
        time_delta_arff = copy.deepcopy(arff_format)

        with open(filename + "_instrument.arff", 'w') as instrument_file, \
                open(filename + "_note.arff", 'w') as note_file, \
                open(filename + "_velocity.arff", 'w') as velocity_file, \
                open(filename + "_duration.arff", 'w') as duration_file, \
                open(filename + "_time_delta.arff", 'w') as time_delta_file:

            for song in self.songs:
                array_instrument, array_note, array_velocity, array_duration, array_time_delta = song.get_arff_arrays(
                )

                instrument_arff['data'] += array_instrument
                note_arff['data'] += array_note
                velocity_arff['data'] += array_velocity
                duration_arff['data'] += array_duration
                time_delta_arff['data'] += array_time_delta

            print "writing file " + instrument_file.name
            arff.dump(instrument_arff, instrument_file)
            print "writing file " + note_file.name
            arff.dump(note_arff, note_file)
            print "writing file " + velocity_file.name
            arff.dump(velocity_arff, velocity_file)
            print "writing file " + duration_file.name
            arff.dump(duration_arff, duration_file)
            print "writing file " + time_delta_file.name
            arff.dump(time_delta_arff, time_delta_file)
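The module-level arff_format template copied at the top of Example 23 is not shown. One plausible shape, consistent with the attribute/data dict layout used elsewhere in these examples, is sketched below; the relation and attribute names are guesses.

# Hypothetical arff_format template assumed by Example 23.
arff_format = {
    'relation': 'midi_events',
    'description': '',
    'attributes': [('value', 'NUMERIC')],
    'data': [],
}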
Example No. 24
def kclustering(top=100, pca=0):
    training = pd.read_csv('documents\csv\drunk\drunk labeling 1300' + '.csv')
    test = pd.read_csv('documents\csv\drunk\drunkTEXT400U' + '.csv')
    main_domain = join(training, 'Clean tweet')
    top = topwords(test, 'Clean tweet', top)
    main_domain = join(training, 'Clean tweet')
    main_domain1 = join(test, 'Clean tweet')
    main_domain.joinall(top.top, 1)
    main_domain1.joinall(top.top, 1)
    training = main_domain.df
    test = main_domain1.df

    cols = ['Clean tweet']

    try:
        for x in cols:
            del training[x]
            del test[x]
    except:
        pass

    print training['L']
    training.L = training.L.replace(['y', 'n'], [True, False])
    test.L = test.L.replace(['y', 'n'], [True, False])
    if pca == 1:

        dftraining, dftest = pcaf(training, test)
        training = dftraining.join(training["L"])
        test = dftest.join(test["L"])

    try:
        training = training.replace(['True', 'False'], [True, False])
        test = test.replace(['True', 'False'], [True, False])
    except:
        pass
    headers_names = list(training.columns.values)
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    headers_names.remove('L')
    headers_names.append('L')

    pca = str(pca)
    test = test[headers_names]
    training = training[headers_names]
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print training.dtypes
    main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv', index=False)
    main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv', index=False)
    arff.dump(r'documents\Arff\unsupervised' + r'\training' + pca + '.arff',
              TRAINING,
              relation="whatever",
              names=headers_names)
    arff.dump(r'documents\Arff\unsupervised' + r'\test' + pca + '.arff',
              TEST,
              relation="whatever",
              names=headers_names)
Example No. 25
 def dump_data_arff(cls, original_filename, destination_filename, X, Y):
     # dump always in big endian
     new_data = np.concatenate((Y,X), axis=1)
     arff_frame = arff.load(open(original_filename,'r'), encode_nominal = True, return_type=arff.DENSE)
     arff_frame['data'] = new_data.astype(int).tolist()
     f = open(destination_filename,"w")
     arff.dump(arff_frame, f)
     f.close()
Example No. 26
def dump2arffnc(data, fname):
    with open(fname, 'wt') as file_pointer:
        arff.dump(data, file_pointer)
    call(['./fixarfffiles.sh', fname])
    call([
        './arff2nc_noorder', fname, '2',
        str(len(targetsToPredict)), fname + '.nc'
    ])
Example No. 27
def create_arff_from_dataframes(df_dic,path):
    for key, val in df_dic.items():
        if int(key) != int(cold_start_index):
            arff.dump(os.path.join(path, str(key) + '.arff')
                      , val.values
                      , relation='relation name'
                      , names=val.columns)
            lines_to_save = None
Example No. 28
    def dump(self, path_or_filehandle):
        output = self._get_arff()

        if isinstance(path_or_filehandle, six.string_types):
            with open(path_or_filehandle, "w") as fh:
                arff.dump(output, fh)
        else:
            arff.dump(output, path_or_filehandle)
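Examples 28 and 30 implement the same path-or-file-handle dispatch, differing only in how a string is detected (six.string_types versus the Python 2 types.StringTypes). A standalone Python 3 sketch of that pattern:

# Path-or-file-handle dispatch sketch: accept either a filename or an open file.
import arff  # liac-arff

def dump_arff(obj, path_or_filehandle):
    if isinstance(path_or_filehandle, str):
        with open(path_or_filehandle, 'w') as fh:
            arff.dump(obj, fh)
    else:
        arff.dump(obj, path_or_filehandle)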
Example No. 29
def dump(df,fp):
    """
    dump DataFrame to file
    :param DataFrame df: 
    :param file fp: 
    """
    arff = __dump(df)
    liacarff.dump(arff,fp)
Example No. 30
    def dump(self, path_or_filehandle):
        output = self._get_arff()

        if isinstance(path_or_filehandle, types.StringTypes):
            with open(path_or_filehandle, "w") as fh:
                arff.dump(output, fh)
        else:
            arff.dump(output, path_or_filehandle)
Example No. 31
def write_to_arff_file(pred_Y, predicted_Y_file_name):
    test_X_arff=arff.load(open(test_X_file_path,'r'))
    arff_data={
        'data':pred_Y, 
        'relation':test_X_arff['relation'], 'description':'', 
        'attributes':[('class',['True','False'])]
        }
    with open('./predicted_test_Y_dt.arff','w') as arff_file:
        arff.dump(arff_data,arff_file)
Example No. 32
def libsvm2arff(input_file, out_file):

    X, y = load_svmlight_file(input_file)
    l,c = X.shape
    data = np.zeros((l,c+1))
    data[:,:-1] = X.toarray()
    data[:,c] = y

    arff.dump(out_file, data)
Example No. 33
def libsvm2arff(input_file, out_file):

    X, y = load_svmlight_file(input_file)
    l, c = X.shape
    data = np.zeros((l, c + 1))
    data[:, :-1] = X.toarray()
    data[:, c] = y

    arff.dump(out_file, data)
Example No. 34
def export_arff(data, attributes, filename, relation="Data", description=None):
    exported_arff = {
        'relation': relation,
        'description': description,
        'data': data,
        'attributes': attributes
    }

    arff.dump(exported_arff, open("Datasets/" + filename + ".arff", "w+"))
Example No. 35
def write_feature_file():
	data = read_test_data()
	arff.dump('data.arff',data,relation='stress',names=['token_num']+feature_name+['is_stress'])
	svm_ofile = open("data.svm","w")
	for ins in data:
		line = "+1 " if ins[-1] else "-1 "
		dict_line = [str(index+1)+":"+str(ins[index]) for index in range(0,len(ins)-1) if ins[index]!=0]
		svm_ofile.write(line+" ".join(dict_line)+"\n")
	svm_ofile.close()
Example No. 36
    def __prepare_arff(self, tid):
        p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'],
                 'r')
        bag_title = pickle.load(p)
        p.close()
        p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'],
                 'r')
        bag_body = pickle.load(p)
        p.close()

        data = {
            'attributes': [],
            'data': [],
            'description': u'',
            'relation': tid
        }
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        f = arff.load(
            open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        fnew = open("%s%d-wordvec-nonsparse.arff" % \
                        (paths['weka.training_arff_dir'], tid), 'w')
        arff.dump(fnew, data)
        fnew.close()

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
                  (paths['weka.weka_jar'],
                   paths['weka.training_arff_dir'], tid,
                   paths['weka.training_arff_dir'], tid),
              shell = True).communicate()

        remove("%s%d-wordvec-nonsparse.arff" %
               (paths['weka.training_arff_dir'], tid))
Example No. 37
def kclustering(top=100,pca=0):
	training=pd.read_csv('documents\csv\drunk\drunk labeling 1300'+'.csv'  )
	test=pd.read_csv( 'documents\csv\drunk\drunkTEXT400U'+'.csv' )
	main_domain = join(training,'Clean tweet')
	top = topwords(test,'Clean tweet',top)
	main_domain = join(training,'Clean tweet')
	main_domain1 = join(test,'Clean tweet')
	main_domain.joinall(top.top,1)
	main_domain1.joinall(top.top,1)
	training=main_domain.df
	test=main_domain1.df


	cols=['Clean tweet']

	try:
		for x in cols:
			del training[x]
			del test[x]
	except:
		pass


	
	print training['L']
	training.L=training.L.replace(['y','n'], [True,False])
	test.L=test.L.replace(['y','n'], [True,False])
	if pca==1:

		dftraining, dftest=pcaf(training,test)
		training =dftraining.join(training["L"])
		test=dftest.join(test["L"])
	
	try:
		training=training.replace(['True','False'], [True,False])	
		test=test.replace(['True','False'], [True,False])
	except:
		pass
	headers_names=list(training.columns.values)
	training=training.astype(np.float64)
	test=test.astype(np.float64)
	training['L']=training['L'].astype(bool)
	test['L']=test['L'].astype(bool)
	headers_names.remove('L')
	headers_names.append('L')
	
	pca=str(pca)
	test = test[headers_names]
	training = training[headers_names]
	TRAINING=training.as_matrix(columns=None)
	TEST=test.as_matrix(columns=None)
	print training.dtypes
	main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv',index=False)
	main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv',index=False)
	arff.dump(r'documents\Arff\unsupervised'+r'\training'+pca+'.arff',TRAINING, relation="whatever", names=headers_names)
	arff.dump(r'documents\Arff\unsupervised'+r'\test'+pca+'.arff',TEST, relation="whatever", names=headers_names)
Example No. 38
def getArticleList():
	ArticleList = []
	for i in range(int(sys.argv[1]), int(sys.argv[2])) :
		print('page '+str(i));
		ArchivesIDList = getArchivesIDList(i)
		for ID in ArchivesIDList:
			a = getArticle(ID)
			ArticleList.append(a.toList())

		arff.dump('article_'+str(i)+'.arff', ArticleList, relation="article", names=['ArchivesID', 'Category', 'Department', 'ReadCount', 'Title', 'Content', 'Glossary'])
Example No. 39
 def write(file_name, data):
     """
     Writes ARFF data dictionary to file.
     :param file_name: File name
     :param data: Data dictionary
     :return:
     """
     f = open(file_name, 'w')
     arff.dump(data, f)
     f.close()
Example No. 40
 def dump_data_arff(cls, original_filename, destination_filename, X, Y):
     # dump always in big endian
     new_data = np.concatenate((Y, X), axis=1)
     arff_frame = arff.load(open(original_filename, 'r'),
                            encode_nominal=True,
                            return_type=arff.DENSE)
     arff_frame['data'] = new_data.astype(int).tolist()
     f = open(destination_filename, "w")
     arff.dump(arff_frame, f)
     f.close()
Example No. 41
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    column_header = [
        'batch_size', 'epochs', 'h_flip',
        'learning_rate_init', 'lr_decay', 'momentum',
        'patience', 'resize_crop', 'shuffle',
        'tolerance', 'v_flip', 'weight_decay'
    ]

    all_results = None
    hyperparameters = None
    for dataset in config_spaces.DATASETS:
        results = pd.read_csv(os.path.join(args.input_dir, dataset, '%s-features.csv' % dataset), header=None, names=column_header)
        accuracy = np.loadtxt(os.path.join(args.input_dir, dataset, '%s-responses-acc.csv' % dataset), delimiter=',')
        runtime = np.loadtxt(os.path.join(args.input_dir, dataset, '%s-responses-time.csv' % dataset), delimiter=',')
        assert results.shape[0] == accuracy.shape[0] == runtime.shape[0]
        results['predictive_accuracy'] = accuracy
        results['runtime'] = runtime
        results['dataset'] = dataset

        config_space = config_spaces.get_config_space(dataset, 0)
        if hyperparameters is None:
            hyperparameters = config_space.get_hyperparameter_names()
        else:
            assert hyperparameters == config_space.get_hyperparameter_names()

        # sanity checks on parameter values
        for hp in config_space.get_hyperparameters():
            if isinstance(hp, ConfigSpace.CategoricalHyperparameter):
                results[hp.name] = results[hp.name].apply(lambda val: hp.choices[val])
            elif isinstance(hp, ConfigSpace.hyperparameters.NumericalHyperparameter):
                for idx, value in enumerate(results[hp.name].values):
                    if not (hp.lower <= value <= hp.upper):
                        raise ValueError('Illegal value for %s at %d: %s' % (hp.name, idx, value))
            else:
                raise ValueError('Hyperparameter type not supported: %s' % hp.name)
        for idx, value in enumerate(results['predictive_accuracy'].values):
            assert 0.0 <= value < 100.0, 'Accuracy iteration %d for dataset %s: %f' % (idx, dataset, value)
        for idx, value in enumerate(results['runtime'].values):
            assert 0.0 < value
        if all_results is None:
            all_results = results
        else:
            all_results = all_results.append(results)
    os.makedirs(args.output_dir, exist_ok=True)
    json_meta = {
        'col_measures': ['predictive_accuracy', 'runtime'],
        'col_parameters': hyperparameters
    }
    arff_dict = openmlcontrib.meta.dataframe_to_arff(all_results, 'fanova-cnn', json.dumps(json_meta))
    output_file = os.path.join(args.output_dir, 'fanova-cnn.arff')
    with open(output_file, 'w') as fp:
        arff.dump(arff_dict, fp)
    logging.info('saved to %s' % output_file)
Example No. 42
def write_arff(header, data, fp, root_dir):
    print('Writing {0}'.format(fp))
    new_arff = {
        'attributes': header,
        'data': data,
        'relation': fp,
        'description': ''
    }

    with open('{0}/{1}.arff'.format(root_dir, fp), "w") as fh:
        arff.dump(new_arff, fh)
Example No. 43
def save_arff(df: pd.DataFrame, file_path: Path):
    attributes = [(f"Attr{i}", 'NUMERIC') for i in range(FEATURE_COUNT)]
    attributes += [(f"Class{i}", ['0', '1']) for i in range(CLASS_COUNT)]
    instance_count = df.shape[0]
    arff_data = [df.iloc[i].tolist() for i in range(instance_count)]
    arff_dict = {'attributes': attributes,
                 'data': arff_data,
                 'relation': DATASET_NAME,
                 'description': ''}

    with file_path.open(mode='wt') as file:
        arff_output.dump(obj=arff_dict, fp=file)
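The save_arff helper above depends on module-level names that are not shown (arff_output, FEATURE_COUNT, CLASS_COUNT, DATASET_NAME). One plausible preamble, with the constants invented purely for illustration:

# Hypothetical module preamble for the save_arff helper above.
from pathlib import Path
import pandas as pd
import arff as arff_output  # liac-arff, aliased as in the example

FEATURE_COUNT = 64          # assumed number of numeric feature columns
CLASS_COUNT = 14            # assumed number of binary class columns
DATASET_NAME = 'multilabel_toy'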
Example No. 44
def test_save(df: pd.DataFrame):
    attributes = [(j, 'NUMERIC') if df[j].dtypes in ['int64', 'float64'] else
                  (j, df[j].unique().astype(str).tolist()) for j in df]

    arff_dic = {
        'attributes': attributes,
        'data': df.values,
        'relation': 'myRel',
        'description': ''
    }
    with open("myfile.arff", "w", encoding="utf8") as f:
        arff.dump(arff_dic, f)
Example No. 45
    def generate_arff_file(self, file_path, file_name, arff_data):
        """
        Generates an ARFF file.
        :param file_path: directory to write the file into
        :param file_name: file name for the ARFF data
        :param arff_data: dict, ARFF data
        """

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        arff_file = codecs.open(file_path+file_name, 'w+', encoding='utf-8')
        arff.dump(arff_data, arff_file)
        arff_file.close()
Example No. 46
def todo(document1,document2,target,target1,A=1,varydocument=0,joineig=0,undersamplingv=0): #varydocument= 0 it varies the source and Joineig=0 it adds the spectral features
	print 'size: ', 'A= ',A,'eigenvectors=', joineig, 'with or without eigenvectors 1=without 0=with 2=withoutdi'
	spectral=espectralfeature(document1,document2)	
	df, test=spectral.spectralcluster(A,varydocument,joineig,undersamplingv)
	print "PASO 1 COMPLETED"
	headers_names=list(df.columns.values)
	
	cols=['Clean tweet','tweet','url']
	for x in cols:
	 try:	
		del df[x]
		del test[x]
	 except:
	 	pass
	try:
		df=df.replace(['True','False'], [True,False])   
	except:
		pass
	try:
	 test=test.replace(['True','False'], [True,False])    
	except:
		pass
	print headers_names
	headers_names=list(df.columns.values)
	headers_names.remove('L')
	headers_names.append('L')
	print headers_names
	
	
	
	print type(headers_names)
	test = test[headers_names]
	df= df[headers_names]

	A=str(A)
	joineig=str(joineig)
	varydocument=str(varydocument)
	undersamplingv=str(undersamplingv)
	
	df.to_csv(target+'\Training'+A+'.csv',index=False)
	test.to_csv(target+'\Test'+A+'.csv',index=False)
	print "COMPLETED 0", df.dtypes

	TRAINING=df.as_matrix(columns=None)
	print "COMPLETED 0.1"
	arff.dump(target1+r'\training'+A+varydocument+ joineig+undersamplingv+'.arff',TRAINING, relation="whatever", names=headers_names)
	TEST=test.as_matrix(columns=None)	 
	arff.dump(target1+ r'\test'+A+varydocument+joineig+undersamplingv+'.arff',TEST, relation="whatever", names=headers_names)
	print "COMPLETED"
Example No. 47
    def __prepare_arff(self, tid):
        p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_title = pickle.load(p)
        p.close()
        p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_body = pickle.load(p)
        p.close()

        data = {'attributes': [], 'data': [], 'description': u'', 'relation': tid}
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        fnew = open("%s%d-wordvec-nonsparse.arff" % \
                        (paths['weka.training_arff_dir'], tid), 'w')
        arff.dump(fnew, data)
        fnew.close()

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
                  (paths['weka.weka_jar'],
                   paths['weka.training_arff_dir'], tid,
                   paths['weka.training_arff_dir'], tid),
              shell = True).communicate()

        remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid))
Example No. 48
 def test_files(self):
     fname = os.path.join(SRC_DIR, 'example.arff')
     data = [
         ['blonde', 17.2, 1],
         ['blue', 27.2, 2],
         ['blue', 18.2, 3],
         ]        
     arff.dump(fname, data, relation='diabetics_data', names=('hair_color', 'age', 'patno'))
     data = list(arff.load(os.path.join(SRC_DIR, fname)))
     arff_rows = arff.dumps(data)
     reparsed_data = list(arff.loads(arff_rows))
     
     data = [list(row) for row in data]
     reparsed_data = [list(row) for row in reparsed_data]
     
     self.assertEqual(data, reparsed_data)
Example No. 49
 def generateArffFile(self,datafeatures):
     print "data features length",len(datafeatures)
     try:
         self.features = self.features + self.const.LABEL_FEATURES_GOOD
         # OUTPUT_FILE_TRAIN
         output_file = self.const.OUTPUT_FILE_TRAIN
         if self.mode.lower() == 'test':
             output_file=self.const.OUTPUT_FILE_TEST
         print "generating arff file ", output_file ,"this will take time. please wait. "
         features_underscore = []
         for gram in self.features:
             features_underscore.append(gram.replace(" ","_"))
         arff.dump(output_file, datafeatures, relation="yelp", names=features_underscore)
         print "arff file generation done."
     except:
         print "Error: Generating Arff file. \n Reason: ",sys.exc_info()
Example No. 50
 def saveArff(path, filename, dim, X, y):
     
     data = X.tolist()
     for i, row in enumerate(data):
         row.append(str(y[i]))
         
     attributes = ['centroid_%d'%(i+1) for i in range(X.shape[1])]
     attributes.append('class_name')
     outFilePath= os.path.join(path, filename)
     
     # dump writes the file at outFilePath itself; no separate handle is needed
     arff.dump(outFilePath, data, relation="whatever", names=attributes)
     
     print '.arff file saved in %s'%outFilePath
     
     
Example No. 51
def main():
    
    verbose = False
    splits = 0
    arff_filename = ''
    
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'hvs:a:')
    except getopt.GetoptError as err:
        #print str(err)
        usage()
        sys.exit(2)
        
    #print optlist, args
    
    for opt, val in optlist:
        if opt == '-h':
            usage()
            sys.exit(0)
        elif opt == '-v':
            verbose = True
        elif opt == '-s':
            splits = atoi(val)
        elif opt == '-a':
            arff_filename = val 
    
    #print splits, arff_filename
    
    if not arff_filename:
        usage()
        sys.exit(2)
        
    fd = open(arff_filename,'r')
    data = arff.load(fd) 
    fd.close()
    
    if verbose:
        print 'Record read from input file:', len(data['data'])
    
    if splits:
        arff_splits = arff.split(data,splits)
        for s in range(len(arff_splits)):
            filename_base = arff_filename.rsplit('.',1)[0]
            split_filename = '%s_split%d.arff' % (filename_base, s)
            fdsplit = open(split_filename, 'w')
            arff.dump(fdsplit, arff_splits[s])
            fdsplit.close()
Example No. 52
def generate_arquivo(fase):
    jogo_fase = Jogo.objects.all().filter(fase=fase)
    jsn = []
    colunas = ['aluno', 'frustrado','qtd_toques', 'tentativas', 'tempo', 'med_toques_segundo']

    colunas.extend(['qtd_toque_tipo_' + x.__str__() for x in range(13)])

    qtd_max_toq = 0
    for j in jogo_fase:
        toques = j.toques_set.all().order_by('t')
        qtd_toques = toques.count()
        if qtd_max_toq <= qtd_toques :
            qtd_max_toq = qtd_toques
        col_toq = []

        qtd_toque_tipo = [0] * 13
        for t in toques:
            col_toq.append(t.x)
            col_toq.append(t.y)
            col_toq.append(t.t)
            col_toq.append(t.acao)
            qtd_toque_tipo[t.acao] = qtd_toque_tipo[t.acao] + 1

        med_toques_segundo = 0
        if float(j.tempo/100.0) <> 0:
            med_toques_segundo = qtd_toques / float(j.tempo/100.0)

        jsn.append([j.aluno, j.frustrado, qtd_toques, j.tentativas, j.tempo, med_toques_segundo] + qtd_toque_tipo + col_toq)

    ind = 0
    for j in jogo_fase:
        toques = j.toques_set.all().order_by('t')
        qtd_toques = toques.count()
        for i in range(qtd_toques, qtd_max_toq):
            if qtd_toques == qtd_max_toq:
                break
            jsn[ind] = jsn[ind] + [0L, 0L, 0L, 0L]
        ind = ind + 1

    for i in range(1, qtd_max_toq+1):
        colunas.append('toque_' + i.__str__() + '_x')
        colunas.append('toque_' + i.__str__() + '_y')
        colunas.append('toque_' + i.__str__() + '_t')
        colunas.append('toque_' + i.__str__() + '_acao')

    arff.dump('results/result_fase_'+fase.__str__() +'.arff', jsn, relation="jogo_fase_" + fase.__str__(), names=colunas)
Example No. 53
def _select_feature(raw_dict, labels, user_mat):
    # write to arff file
    obj = {}
    obj['relation'] = 'dictionary'
    obj['attributes'] = _generate_att_list(len(raw_dict))
    concat_user_mat = copy.deepcopy(user_mat)
    for ii in range(len(concat_user_mat)):
        concat_user_mat[ii].append(labels[ii])
    obj['data'] = concat_user_mat

    arff_file = open('.tmp.arff', 'w', encoding='utf-8')
    arff.dump(obj, arff_file)

    # use weka to select feature
    ll = os.popen('java -jar FeatureSelect/out/artifacts/FeatureSelect_jar/FeatureSelect.jar .tmp.arff').read()
    selected_index = ll.split()
    return list(fucking_map(lambda index: raw_dict[int(index)], selected_index))
Example No. 54
def predict():
    fields = [
        'stats.totals.pts.value', 
        'stats.totals.ast.value', 
        'stats.totals.trb.value',
        'stats.per_game.pts_per_g.value',
        'stats.per_game.ast_per_g.value',
        'stats.per_game.trb_per_g.value',
        'stats.advanced.per.value', 
    #    'stats.advanced.ws.value', 
    ]
    
    query = {
        #'name': 'Jack McCloskey',
        'new_hof_probability': {'$exists': False},
        'stats.advanced.per': {'$exists': True},
        'stats.advanced.per.complete': True,
    #    'stats.advanced.ws': {'$exists': True},
    #    'stats.advanced.ws.complete': True,
        'stats.totals.pts': {'$exists': True},
        'stats.totals.pts.complete': True,
        'stats.totals.ast': {'$exists': True},
        'stats.totals.ast.complete': True,
        'stats.totals.trb': {'$exists': True},
        'stats.totals.trb.complete': True,
        'stats.per_game.trb_per_g': {'$exists': True},
        'stats.per_game.trb_per_g.complete': True,
        'stats.per_game.pts_per_g': {'$exists': True},
        'stats.per_game.pts_per_g.complete': True,
        'stats.per_game.ast_per_g': {'$exists': True},
        'stats.per_game.ast_per_g.complete': True,
    }

    for p in db_players.find(query):
        logger.info('Player {}'.format(p['name']))
        player = [nested_get(p, f) for f in fields]
        player.append(len(nested_get(p, 'honors.allstar_appearances', [])))
        player.append(len(nested_get(p, 'honors.championships', [])))
        player.append(nested_get(p, 'honors.mvpshares', 0))
        player.append(nested_get(p, 'hall_of_fame'))
        arff.dump('test.arff', [player], relation="nba", names=fields+['honors.allstar_appearances', 'honors.championships', 'honors.mvpshares', 'hall_of_fame'])    
        raw_output = subprocess.check_output('java -cp /Applications/weka-3-6-9/weka.jar weka.classifiers.functions.RBFNetwork -T test.arff -l new.model -p 0'.split())
        prob = parse_probability(raw_output)
        logger.info('Player {name}\'s HOF Probability is {prob}'.format(name = p['name'], prob = prob))
        db_players.update({'_id': p['_id']}, {"$set":{"new_hof_probability": prob}}, safe=True, upsert=True)
Example No. 55
def evaluate_apk(permissions, perm_file, model_file):
	fd = open(perm_file,'r')
	perm_list = simplejson.load(fd)
	fd.close()
#	permissions = get_permissions(filename)

	bitmap = perm_bitmap(perm_list, permissions)+[True]

	temp=tempfile.mkstemp(suffix='.arff')
	arff.dump(temp[1],[bitmap], names=perm_list+['Class'])

	output = subprocess.check_output(['java','weka.classifiers.bayes.NaiveBayesUpdateable','-p','0','-T',temp[1],'-l',model_file])	
	#os.remove(temp[1])
	virus =  output.split()[13]=='1:True'
	assurance = output.split()[14]
	if assurance == '+':
		assurance = output.split()[15]
	return (virus, str(assurance))
Example No. 56
def makeArff(filename,handle,opts):
	readdata = False
	dataOut= []
	attributesOut=[]
	relation = filename.split(".")[0] 
	outfile = open(handle,'w')
	with open(filename) as data_file: 
		dataIn = json.load(data_file)		
	for entry in dataIn["data"]:			
		values = []
		attributes = []
		for value in entry:			
			attributes.append(value)
			if isinstance(entry[value], unicode):
				entry[value] = entry[value].encode('ascii','ignore')										
			values.append(entry[value])
		dataOut.append(values)
		attributesOut.append(attributes)	
	arff.dump(outfile, dataOut, relation=relation, names=attributesOut[0])
Example No. 57
def getArchivesIDListFromARFF():
	start = int(sys.argv[1])
	end = int(sys.argv[2])
	f = codecs.open('article_merged_2.arff', 'r', encoding='utf8')
	span = 0
	ArticleList = []
	for index, l in enumerate(f):
		if(l[0] == '@'):
			span = span + 1
			continue
		elif(index >= start + span and index < end + span):
			l = l.split(',')
			ID = l[0].replace('"', '')
			a = getArticle(ID)
			if(a == False):
				continue
			ArticleList.append(a.toList())
		else:
			continue
	arff.dump('article_v2_'+str(start)+'_'+str(end)+'.arff', ArticleList, relation="article", names=['ArchivesID', 'Category', 'Department', 'ReadCount', 'Title', 'Content', 'Glossary'])
Example No. 58
def genArff(arff_file, features, options, relation='rotations'):

    #print features
    #print options
    
    data = []
    keys = []
    featureIndex = -1
    nextRotIndex = 0
    
    for key in options:
        #print "len features[",key,"]",len(features[key])
        sampleKey = key
    #print sampleKey, len(features[sampleKey])
    for i in range(len(features[sampleKey])):
        data.append([])
    
    for feature in options:  
        #if not "next" in feature and not "prev" in feature and feature!="result":
        if feature!="next":   
            keys.append(feature)
            for i, val in enumerate(features[feature]):
                data[i].append(val)
 #   for feature in options:
 #       if "next" in feature
 #   for feature in options:
 #       if feature == "prev":
            
 #       elif feature in ["prev"]
 #           for idx, val in enumerate(features[feature]):
  #              data[idx].append(val)
                
    keys.append('next')
    for i, val in enumerate(data):
        val.append(features['next'][i])
                
    if os.getlogin() == 'scottmcclanahan2002':
        return arff.dump(open(arff_file, 'w'), data, relation, keys)
    return arff.dump(arff_file, data, relation, keys)
Example No. 59
        with open(os.path.join(output_dir_, "description.txt"), "w") as fh:
            for line in description:
                fh.write(line)
                fh.write("\n")

        # Copy feature values and add instance id
        with open(os.path.join(metafeatures_dir,
                               "feature_values.arff")) as fh:
            feature_values = arff.load(fh)

        feature_values['relation'] = scenario_id + "_" + feature_values[
            'relation']

        with open(os.path.join(output_dir_, "feature_values.arff"),
                  "w") as fh:
            arff.dump(feature_values, fh)

        # Copy feature runstatus and add instance id
        with open(os.path.join(metafeatures_dir,
                               "feature_runstatus.arff")) as fh:
            feature_runstatus = arff.load(fh)

        feature_runstatus['relation'] = scenario_id + "_" + \
                                        feature_runstatus['relation']

        with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") \
                as fh:
            arff.dump(feature_runstatus, fh)

        # Copy feature runstatus and add instance id
        with open(
                                 ('repetition', 'NUMERIC')] + \
        [('%s' % name, 'NUMERIC') for name in metafeature_values.columns]
    arff_object['relation'] = "FEATURE_VALUES"
    arff_object['description'] = ""

    data = []
    for idx in metafeature_values.index:
        line = [idx, 1]
        line += [value if np.isfinite(value) else None
                 for value in metafeature_values.ix[idx,:].values]
        data.append(line)
    arff_object['data'] = data

    with open(os.path.join(args.output_directory, "feature_values.arff"),
              "w") as fh:
        arff.dump(arff_object, fh)

    # Feature steps and runtimes according to the aslib1.0 format
    feature_steps = defaultdict(list)
    metafeature_names = list()
    for metafeature_name in metafeatures.metafeatures.functions:
        dependency = metafeatures.metafeatures.get_dependency(metafeature_name)
        if dependency is not None:
            feature_steps[dependency].append(metafeature_name)
        feature_steps[metafeature_name].append(metafeature_name)

        metafeature_names.append(metafeature_name)

    # Write the feature runstatus in the aslib1.0 format
    arff_object = dict()
    arff_object['attributes'] = [('instance_id', 'STRING'),