# -- Python 2 --
import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer


def get_data(f='net/nettalk.data.txt', size=2):
    f = open(f)
    words = [process_line(line)[2:] for line in f]
    X, y = [], []
    vect_syl = DictVectorizer()
    vect_stress = DictVectorizer()
    # First pass: accumulate the full feature-name sets by hand.
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    for word, stress in words:
        if word is None:
            continue
        if len(word.strip().replace('-', '')) != len(stress):
            print >> sys.stderr, "Skipped %s" % word
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
            word.strip(), stress, size=size)
        for x in x_dict_syl:
            for name, v in x.iteritems():
                if isinstance(v, (str, unicode)):
                    # String values become one-hot names, e.g. "feature=value".
                    name = "%s%s%s" % (name, vect_syl.separator, v)
                vect_syl.feature_names_.add(name)
        for x in x_dict_stress:
            for name, v in x.iteritems():
                if isinstance(v, (str, unicode)):
                    name = "%s%s%s" % (name, vect_stress.separator, v)
                vect_stress.feature_names_.add(name)
    # Freeze the collected names into the sorted list + index mapping that a
    # fitted DictVectorizer would normally hold.
    vect_syl.feature_names_ = sorted(vect_syl.feature_names_)
    vect_syl.vocabulary_ = dict(
        (f, i) for i, f in enumerate(vect_syl.feature_names_))
    vect_stress.feature_names_ = sorted(vect_stress.feature_names_)
    vect_stress.vocabulary_ = dict(
        (f, i) for i, f in enumerate(vect_stress.feature_names_))
    # Second pass: vectorize every word with the now-"fitted" vectorizers.
    for word, stress in words:
        if word is None:
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
            word, stress, size=size)
        if not len(x_dict_syl):
            print >> sys.stderr, "Empty features for {}".format(word)
            continue
        X.append((vect_syl.transform(x_dict_syl),
                  vect_stress.transform(x_dict_stress)))
        where_stress = y_stress.argmax()
        if y_stress[where_stress] == 1:
            y_stress[where_stress + 1:] = 2
        y_syl += 1
        y.append(np.r_[y_syl, y_stress])
    return X, y
import numpy as np
from sklearn.feature_extraction import DictVectorizer


def load(conn, table, feature_column='feature', weight_column='weight',
         bias_feature=None):
    df = conn.fetch_table(table)
    intercept = np.array([0.])  # (1,)
    coef = np.array([[]])       # (1, n_feature)
    vocabulary = {}
    feature_names = []
    j = 0
    for i, row in df.iterrows():
        feature, weight = row[feature_column], row[weight_column]
        if feature == bias_feature:
            # The bias row becomes the model intercept, not a column.
            intercept[0] = float(weight)
            continue
        coef = np.append(coef, [[weight]], axis=1)
        vocabulary[feature] = j
        j += 1
        feature_names.append(feature)
    # Rebuild a "fitted" DictVectorizer so transform() maps feature names
    # onto the same columns as the loaded coefficients.
    vectorizer = DictVectorizer(separator='#')
    vectorizer.vocabulary_ = vocabulary
    vectorizer.feature_names_ = feature_names
    return coef, intercept, vectorizer
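# A minimal usage sketch for load() above; `FakeConn` and the weights table
# are hypothetical stand-ins for the real database connection. The returned
# triple behaves like a fitted linear model plus its vectorizer, so scoring
# a new feature dict is a transform followed by a dot product.
import pandas as pd


class FakeConn(object):
    def fetch_table(self, table):
        return pd.DataFrame({'feature': ['__BIAS__', 'len', 'n_vowels'],
                             'weight': [-0.5, 1.2, 0.3]})


coef, intercept, vectorizer = load(FakeConn(), 'weights',
                                   bias_feature='__BIAS__')
X = vectorizer.transform([{'len': 4, 'n_vowels': 2}])
print(X.dot(coef.T) + intercept)  # linear decision value, shape (1, 1)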
def feature_dict_to_dict_vectorizer(feature_dict):
    # Convert to DictVectorizer
    from sklearn.feature_extraction import DictVectorizer
    DV = DictVectorizer(sparse=True)
    T2I = {}
    feature_names = []
    vocab = {}
    for k, v in feature_dict.items():
        if k not in vocab:
            if '=' not in k:
                # Plain (numeric) features are returned separately.
                T2I[k] = v
            else:
                # "name=value" features seed the vectorizer directly (note:
                # this stores the feature's value, where a fitted
                # DictVectorizer would store a column index).
                vocab[k] = v
                feature_names.append(k)
    if DV.sort:
        feature_names.sort()
    DV.feature_names_ = feature_names
    DV.vocabulary_ = vocab
    return T2I, DV
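# Usage sketch for the splitter above (the input dict is hypothetical):
# keys containing '=' are treated as already-expanded categorical features
# and seeded into the vectorizer; plain keys come back separately in T2I.
T2I, DV = feature_dict_to_dict_vectorizer(
    {'pos=NN': 1, 'word=dog': 1, 'length': 3})
# T2I == {'length': 3}
# DV.feature_names_ == ['pos=NN', 'word=dog'] (sorted, since DV.sort is True)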
from sklearn.feature_extraction import DictVectorizer


def create_dict_vectorizer(vocab):
    """Build a DictVectorizer over a fixed, sorted vocabulary without
    fitting it."""
    ngram_to_idx = dict((n, i) for i, n in enumerate(sorted(vocab)))
    _count2vec = DictVectorizer(separator=":")
    _count2vec.vocabulary_ = ngram_to_idx.copy()
    rev_dict = dict((y, x) for x, y in ngram_to_idx.items())
    _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
    return _count2vec
import numpy as np
from sklearn.feature_extraction import DictVectorizer


def deserialize_dict_vectorizer(model_dict):
    model = DictVectorizer()
    model.dtype = np.dtype(model_dict['dtype']).type
    model.separator = model_dict['separator']
    model.sparse = model_dict['sparse']
    model.sort = model_dict['sort']
    model.feature_names_ = model_dict['feature_names']
    model.vocabulary_ = model_dict['vocabulary']
    return model
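# Hedged counterpart to the deserializer above: serialize_dict_vectorizer is
# an assumed helper (not part of scikit-learn) showing the dict layout the
# loader expects; it presumes a fitted or hand-initialized vectorizer.
def serialize_dict_vectorizer(model):
    return {
        'dtype': np.dtype(model.dtype).name,  # e.g. 'float64'
        'separator': model.separator,
        'sparse': model.sparse,
        'sort': model.sort,
        'feature_names': model.feature_names_,
        'vocabulary': model.vocabulary_,
    }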
def initialize_vectorizer(vocabulary):
    """
    Initialize a vectorizer that transforms a counter dictionary
    into a sparse vector of counts (with a uniform feature index)
    """
    ## Enumerate terms (assumes `vocabulary` is already sorted)
    ngram_to_idx = dict((t, i) for i, t in enumerate(vocabulary))
    ## Create Dict Vectorizer
    _count2vec = DictVectorizer(separator=":", dtype=int)
    _count2vec.vocabulary_ = ngram_to_idx.copy()
    rev_dict = dict((y, x) for x, y in ngram_to_idx.items())
    _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
    return _count2vec
def __call__(self):
    aux_data = pickle.load(open(self.feature_map_file, "rb"))
    model: SGDClassifier = pickle.load(open(self.model_file_name, "rb"))
    frequent_words = aux_data[TrainModel.FREQUENT_WORDS]
    # Rebuild the vectorizer used at training time from the saved mappings.
    vectorizer = DictVectorizer()
    vectorizer.vocabulary_ = aux_data[TrainModel.FEATURE_IDXS]
    vectorizer.feature_names_ = aux_data[TrainModel.FEATURE_NAMES]
    tagged_sentences = []
    with open(self.input_file_name, 'r') as in_f:
        lines = [line.rstrip() for line in in_f.readlines()]
    already_tagged = all(
        map(lambda l: all(map(lambda w: '/' in w, l.split(' '))), lines))
    print('input already tagged:', already_tagged)
    sentences = [
        ExtractFeatures.split_by_whitespace_and_seperate_tags(l)
        for l in lines
    ]
    sentences = list(map(lambda s: list(map(lambda t: t[0], s)), sentences))
    # Group sentences by length so each group can be tagged as a 2-D array.
    sentences_with_idxs = [(s, i) for (i, s) in enumerate(sentences)]
    sentences = sorted(sentences_with_idxs, key=lambda t: len(t[0]))
    idxs_processed = []
    for l, g in itertools.groupby(sentences, key=lambda t: len(t[0])):
        g = list(g)
        sents_of_len_l = np.asarray(list(map(operator.itemgetter(0), g)))
        idxs_of_len_l = list(map(operator.itemgetter(1), g))
        idxs_processed.extend(idxs_of_len_l)
        tags_of_len_l = np.empty(sents_of_len_l.shape, dtype="U8")
        # Tag word position i for every sentence in the group at once,
        # feeding earlier predictions back in as features.
        for i in range(l):
            feats_for_ith_word = []
            for sent_i, word in enumerate(sents_of_len_l[:, i]):
                feats = ExtractFeatures.extract(
                    sents_of_len_l[sent_i, :], tags_of_len_l[sent_i, :], i,
                    (word not in frequent_words))
                feats_for_ith_word.append(feats)
            X = vectorizer.transform(feats_for_ith_word)
            tags_pred = model.predict(X)
            tags_of_len_l[:, i] = tags_pred
        tagged_sents_of_len_l = np.char.add(
            np.char.add(sents_of_len_l, '/'), tags_of_len_l)
        tagged_sentences.extend(
            [' '.join(row) for row in tagged_sents_of_len_l])
    # Restore the original sentence order.
    tagged_sentences = map(
        operator.itemgetter(0),
        sorted(zip(tagged_sentences, idxs_processed),
               key=operator.itemgetter(1)))
    tagged_sentences = [s.replace('$EQ$', '=') for s in tagged_sentences]
    with open(self.output_file, 'w+') as out_f:
        out_f.write('\n'.join(tagged_sentences) + '\n')
def prepareVectors(self, featureslist, classlist=None,
                   vectortransformation=None, featureselection=None,
                   features_pct=50, returnasmatrix=False,
                   filter_features=None, returnindexed=False):
    """
    Takes a list of dictionaries (representing units), in which keys are
    words and values their occurrence within the unit.

    Vectors can be transformed to tf-idf or binomial. Feature selection
    keeps a percentage of the feature list (features_pct) based on a
    relevance score for each feature (e.g., chi2). Note that certain
    feature selection methods (e.g., chi2) require a list of class labels
    matching the units in featureslist. If filter_features is a list of
    feature names, only those features are used.

    Returns a tuple of the features (as dictionaries or a sparse matrix)
    and the list of selected feature names. (The selected names can be
    passed as 'filter_features' to match new vectors to the vectors a
    classifier was trained on; see the call sketch after this function.)
    """
    dv = DictVectorizer()
    fmatrix = dv.fit_transform(featureslist)
    fnames = dv.feature_names_
    if vectortransformation:
        print('- Transforming vectors')
        fmatrix = self.transformVectors(fmatrix, vectortransformation)
    if featureselection and not filter_features:
        print('- Selecting features')
        fmatrix, fnames = self.selectFeatures(fmatrix, fnames,
                                              featureselection, classlist,
                                              features_pct)
        # Store the new feature-name index for dv.inverse_transform.
        dv.feature_names_ = fnames
    if filter_features:
        print('- Filtering features')
        fmatrix, fnames = self.filterFeatures(fmatrix, fnames,
                                              filter_features)
        dv.feature_names_ = fnames
    if not returnasmatrix:
        if returnindexed:
            dv.feature_names_ = list(range(len(fnames)))
        return (dv.inverse_transform(fmatrix), fnames)
    else:
        return (fmatrix, fnames)
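# Hypothetical call sketch for prepareVectors(): select features on the
# training units, then pass the returned names as filter_features so test
# units are vectorized onto the same columns. All variable names and the
# transformation/selection strings are illustrative.
#
#     train_X, selected = self.prepareVectors(
#         train_feats, classlist=train_labels,
#         vectortransformation='tfidf', featureselection='chi2',
#         features_pct=20, returnasmatrix=True)
#     test_X, _ = self.prepareVectors(
#         test_feats, vectortransformation='tfidf',
#         filter_features=selected, returnasmatrix=True)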
def _create_dict_vectorizer(self, vocab):
    """
    Create a DictVectorizer object given a list of vocabulary

    Args:
        vocab (iterable): Sorted list of feature names for the Vectorizer

    Returns:
        _count2vec (DictVectorizer): Count -> csr_matrix Class
    """
    feature_to_idx = dict((n, i) for i, n in enumerate(vocab))
    _count2vec = DictVectorizer(separator=":")
    _count2vec.vocabulary_ = feature_to_idx.copy()
    rev_dict = dict((y, x) for x, y in feature_to_idx.items())
    _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
    return _count2vec
# -- Python 2 --
def learn_feature_matrix(self, triples, person_abs, vectorizer):
    """
    Learns a sparse DictVectorizer object representing a feature matrix
    that can be used in building machine learning models.

    Runs through all the triples, looking up their abstracts in the
    provided abstract cache (person_abs), and creates a feature dictionary
    for each triple. Feature dictionaries from all triples in the cup are
    then used to create a DictVectorizer object, which represents the
    abstract-based feature matrix.
    """
    def _get_feature_dict(triple):
        sub, obj = triple
        obj_idx = vectorizer.target_idx_cache.get(obj)
        if obj_idx is None or sub not in person_abs:
            # No tokens/abstract available for this subject:
            # return an empty dictionary.
            return dict()
        abs_tokens = person_abs[sub]
        d = {}
        for token in abs_tokens:
            if token in vectorizer.top_feature_idx and token not in d:
                d[token] = vectorizer.td_mat[
                    obj_idx, vectorizer.top_feature_idx[token]]
        return d

    print 'Creating feature dictionaries..',
    sys.stdout.flush()
    # Create a list of feature dictionaries, one for each triple.
    D = []
    t1 = time()
    person_no_features = 0
    for i, triple in enumerate(triples):
        d = _get_feature_dict(triple)
        if len(d) == 0:
            person_no_features += 1
        D.append(d)
    print '#People w/o features: {}. Time: {:.2f}s'.format(
        person_no_features, time() - t1)
    # Create a sparse DictVectorizer object representing the feature matrix.
    dvec = DictVectorizer(sparse=True)
    dvec.feature_names_ = vectorizer.top_feature_idx.keys()
    dvec.dvec_mat = dvec.fit_transform(D)
    return dvec
def load_topic_scores(corpus, num_topics):
    topic_vec = DictVectorizer(sparse=True, dtype=float)
    # First pass: read one document's scores to learn the topic vocabulary.
    with open("data/{}/output/T{}/init/model.docs".format(
            corpus, num_topics)) as f:
        f.readline()  # discard the first line
        score = parse_topic_scores(f.readline())
        topic_vec.vocabulary_ = {
            k: i for i, k in enumerate(sorted(score[1].keys()))
        }
        topic_vec.feature_names_ = sorted(score[1].keys())
    # Second pass: vectorize all documents, ordered by document id.
    with open("data/{}/output/T{}/init/model.docs".format(
            corpus, num_topics)) as f:
        f.readline()  # discard the first line
        X_topics = topic_vec.transform(
            (d[1] for d in sorted(
                (parse_topic_scores(l) for l in f.readlines()),
                key=lambda x: x[0])))
    return topic_vec, X_topics
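# parse_topic_scores is not shown above; this is a hypothetical
# reconstruction for illustration, assuming a "<doc_id> <topic>:<score> ..."
# line format. Only its return shape -- (doc_id, {topic: score}) -- is
# implied by the caller.
def parse_topic_scores(line):
    parts = line.split()
    doc_id = int(parts[0])
    scores = dict((topic, float(s))
                  for topic, s in (p.rsplit(':', 1) for p in parts[1:]))
    return doc_id, scores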
def _initialize_dict_vectorizer(vocabulary):
    """
    Initialize a vectorizer that transforms a counter dictionary
    into a sparse vector of counts (with a uniform feature index)

    Args:
        vocabulary (iterable): Input vocabulary

    Returns:
        _count2vec (DictVectorizer): Transformer
    """
    ## Sort
    vocabulary = sorted(vocabulary)
    ## Initialize Vectorizer
    _count2vec = DictVectorizer(separator=":", dtype=int, sort=False)
    ## Update Attributes
    _count2vec.vocabulary_ = dict((x, i) for i, x in enumerate(vocabulary))
    _count2vec.feature_names_ = vocabulary
    return _count2vec
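# Round-trip usage sketch for the vocabulary-seeded vectorizers above (the
# same pattern as create_dict_vectorizer / initialize_vectorizer /
# _create_dict_vectorizer); the toy counters are hypothetical.
from collections import Counter

vec = _initialize_dict_vectorizer(['cat', 'dog', 'fish'])
X = vec.transform([Counter({'dog': 2, 'cat': 1}), Counter({'fish': 5})])
print(X.toarray())               # [[1 2 0] [0 0 5]]
print(vec.inverse_transform(X))  # [{'cat': 1, 'dog': 2}, {'fish': 5}]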
def load_from_json(self, fname):
    # load the model
    import_data = json_tricks.load(open(fname))
    import_clf = ModifiedNB()
    import_clf.class_count_ = import_data['class_count_']
    import_clf.class_log_prior_ = import_data['class_log_prior_']
    import_clf.classes_ = import_data['classes_']
    import_clf.feature_count_ = import_data['feature_count_']
    import_clf.feature_log_prob_ = import_data['feature_log_prob_']
    self.clf = import_clf
    # load the fps dict vectoriser (JSON keys are strings, so the
    # fingerprint-bit feature names are cast back to int)
    v_fps = DictVectorizer()
    dv = import_data['fps_vectoriser']
    v_fps.vocabulary_ = {int(k): v for k, v in dv['vocabulary_'].items()}
    v_fps.feature_names_ = dv['feature_names_']
    self.v_fps = v_fps
    # load the continuous variables binariser
    try:
        binariser = import_data['binariser']
        kbd = KBinsDiscretizer(n_bins=10, encode='onehot',
                               strategy='quantile')
        kbd.n_bins = binariser['n_bins']
        kbd.n_bins_ = binariser['n_bins_']
        kbd.bin_edges_ = np.asarray(
            [np.asarray(x) for x in binariser['bin_edges_']])
        encoder = OneHotEncoder()
        encoder.categories = binariser['categories']
        encoder._legacy_mode = False
        kbd._encoder = encoder
        self.kbd = kbd
    except Exception:
        pass
    # extra parameters
    self.trained = True
    self.con_desc_list = import_data['con_desc_list']
    self.fp_type = import_data['fp_type']
    self.fp_radius = import_data['fp_radius']
    self.informative_cvb = import_data['informative_cvb']
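# Shape of the JSON document load_from_json() expects, reconstructed from
# the reads above (values shown are illustrative placeholders only):
#
# {
#   "class_count_": [...], "class_log_prior_": [...], "classes_": [...],
#   "feature_count_": [...], "feature_log_prob_": [...],
#   "fps_vectoriser": {"vocabulary_": {"1024": 0, ...},
#                      "feature_names_": [1024, ...]},
#   "binariser": {"n_bins": 10, "n_bins_": [...],
#                 "bin_edges_": [[...], ...], "categories": [...]},
#   "con_desc_list": [...], "fp_type": "...", "fp_radius": ...,
#   "informative_cvb": ...
# }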
# -- Python 2; fragment (continues a feature-building helper) --
            if len(left_feature) == k + 1:
                features['%s-%s' % (-i - 1, -i - k - 1)] = left_feature
        x.append(features)
    word_stripped = word.replace('-', '')
    return (x,
            [_build_feature_dict(word_stripped, k, size, size)
             for k in xrange(len(word_stripped))],
            #(np.array(y) == 0).astype(int),
            np.array(y, dtype=int) + 2,
            np.array(stress, dtype=int))


if __name__ == '__main__':
    X_train, y_train = [], []
    vect_syl = DictVectorizer(sparse=True)
    vect_stress = DictVectorizer(sparse=True)
    # Seed feature_names_ as sets so names can be accumulated by hand.
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    # fit vectorizers
    for _, word, stress in syllabifications('../silabe.train.xml', 10):
        if len(word.strip().replace('-', '')) != len(stress):
            print >> sys.stderr, "Skipped %s" % word
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
            word.strip(), stress, size=4)
        for x in x_dict_syl:
            for f, v in x.iteritems():
                if isinstance(v, (str, unicode)):
                    f = "%s%s%s" % (f, vect_syl.separator, v)
                vect_syl.feature_names_.add(f)
        for x in x_dict_stress:
            for f, v in x.iteritems():
                # (fragment truncated here)
# -- Python 2 --
def vectorizeExamples(examples, featureGroups=None, sparseLabels=False,
                      idPath=None):
    print "Vectorizing examples"
    mlb = MultiLabelBinarizer(sparse_output=sparseLabels)
    if "predictions" in examples and examples["predictions"] is not None:
        print "Vectorizing predictions"
        assert idPath is None
        #examples["labels"] = mlb.fit_transform(examples["labels"])
        #examples["predictions"] = examples["labels"]
        numLabels = len(examples["labels"])
        # Fit on labels and predictions together so both share one binarizer.
        vector = mlb.fit_transform(examples["labels"] +
                                   examples["predictions"])
        examples["labels"] = vector[:numLabels, :]
        examples["predictions"] = vector[numLabels:, :]
        print "Vectorized predictions", [examples[x].shape[1]
                                         for x in ("labels", "predictions")]
    else:
        if idPath is not None:
            labelIdPath = os.path.join(idPath, "labels.tsv")
            print "Vectorizing labels with existing ids from", labelIdPath
            labelNames = loadIdNames(labelIdPath)
            mlb = MultiLabelBinarizer(
                [labelNames[x] for x in sorted(labelNames.keys())],
                sparse_output=sparseLabels)
            mlb.fit(set(labelNames.values()))
            examples["labels"] = mlb.transform(examples["labels"])
        else:
            print "Vectorizing labels with new ids"
            examples["labels"] = mlb.fit_transform(examples["labels"])
        examples["label_names"] = mlb.classes_
    if "features" in examples:
        print "Vectorizing features"
        dv = DictVectorizer(sparse=True)
        if idPath is not None:
            featureIdPath = os.path.join(idPath, "features.tsv.gz")
            print "Vectorizing features with existing ids from", featureIdPath
            featureNames = loadIdNames(featureIdPath)
            #dv.fit([featureNames])
            # Rebuild the fitted state from the saved id -> name mapping.
            dv.feature_names_ = [
                featureNames[x] for x in sorted(featureNames.keys())
            ]
            dv.vocabulary_ = dict(
                (f, i) for i, f in enumerate(dv.feature_names_))
            examples["features"] = dv.transform(examples["features"])
        else:
            print "Vectorizing features with new ids"
            examples["features"] = dv.fit_transform(examples["features"])
        examples["feature_names"] = dv.feature_names_
    else:
        examples["feature_names"] = []
    if featureGroups is not None and "select" in featureGroups:
        threshold = .1
        print "Selecting features", examples["features"].shape[1]
        examples["features"] = VarianceThreshold(
            threshold * (1 - threshold)).fit_transform(examples["features"])
        print "Selected features", examples["features"].shape[1]
        #examples["features"] = SelectKBest(chi2, k=1000).fit_transform(examples["features"], examples["labels"])
    print "Vectorized", examples["labels"].shape[0], "examples with", len(
        examples["feature_names"]), "unique features and", len(
        examples["label_names"]), "unique labels", (
        "(sparse)" if sparseLabels else "")
# Fragment: tail of a hand-set coefficient matrix (truncated at both ends).
    -0.02934711, 0.05490663, 0.02008552, 0.05069223, 0., 0.2016651,
    -0.28770706, -0.88722735, -0.26507582, 0.52628048, -1.28404466,
    -1.96447254, 0.07607324, 0.70359565, 0.35094977, 0.01376572
]])
_clf.classes_ = array([False, True])
_clf.intercept_ = [-2.75918545]

_v = DictVectorizer()
_v.feature_names_ = [
    'first_chars= ', 'first_chars="a', "first_chars=' ", "first_chars='A",
    'first_chars=(0', 'first_chars=(A', 'first_chars=(a', 'first_chars=)]',
    'first_chars=, ', 'first_chars=. ', 'first_chars=0', 'first_chars=0 ',
    'first_chars=0,', 'first_chars=0.', 'first_chars=00', 'first_chars=0:',
    'first_chars=0\\', 'first_chars=@', 'first_chars=A', 'first_chars=A ',
    'first_chars=A,', 'first_chars=A-', 'first_chars=A.', 'first_chars=A0',
    'first_chars=A=', 'first_chars=AA', 'first_chars=Aa', 'first_chars=[0',
    'first_chars=[A', 'first_chars=[a', 'first_chars=\\A', 'first_chars=a ',
    'first_chars=a(', 'first_chars=a-', 'first_chars=a.', 'first_chars=a0',
    'first_chars=aA', 'first_chars=a[', 'first_chars=aa', 'isalpha',
    'isdigit', 'islower', 'mean_len', 'prev_len', 'punct= ', 'punct="',
    'punct=%', 'punct=(', 'punct=)', 'punct=*', 'punct=,', 'punct=-',
    'punct=.', 'punct=:', 'punct=;', 'punct=@', 'punct=]', 'this_len'
]
_v.vocabulary_ = {
    'first_chars= ': 0,
    'first_chars="a': 1,
    "first_chars=' ": 2,
    "first_chars='A": 3,
    'first_chars=(0': 4,
    'first_chars=(A': 5,
    'first_chars=(a': 6,
    # (fragment truncated here)
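# Usage sketch for the hand-initialized pair above (the feature dict is
# hypothetical, and the snippet is truncated, so this is illustrative only):
# the vectorizer maps a feature dict onto the columns the coefficients were
# trained against, making prediction a single transform + predict.
#
#     feats = {'this_len': 4, 'prev_len': 7, 'mean_len': 5.5,
#              'isalpha': True, 'punct': '.', 'first_chars': 'A '}
#     _clf.predict(_v.transform([feats]))  # -> array([ True]) or array([False])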