# Cluster infinitives with affinity propagation and visualize a 2-D PCA
# projection. extract_features is defined elsewhere in this project.
from itertools import cycle

import matplotlib.pyplot as pl
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA


def affinity(infinitives):
    print("Extracting features...")
    X, _ = extract_features(infinitives, 3, False)

    # Similarity matrix: negative squared Euclidean distances,
    # S[i, j] = -||x_i - x_j||^2.
    X_norms = np.sum(X * X, axis=1)
    S = -X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    p = 10 * np.median(S)  # preference: controls how many exemplars emerge

    print("Fitting affinity propagation clustering...")
    af = AffinityPropagation(affinity='precomputed', preference=p).fit(S)
    indices = af.cluster_centers_indices_
    for i, idx in enumerate(indices):
        print(i, infinitives[idx])
    n_clusters_ = len(indices)

    print("Fitting PCA...")
    X = PCA(n_components=2, svd_solver='randomized').fit_transform(X)

    print("Plotting...")
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = af.labels_ == k
        cluster_center = X[indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o',
                markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors


def k_clusters(k, infinitives):
    """Cluster into k groups and print the infinitive nearest each centroid."""
    data, _ = extract_features(infinitives, 3, False)
    kmeans = KMeans(n_clusters=k).fit(data)
    print(kmeans.inertia_)
    # A 1-nearest-neighbor lookup maps each centroid back to a real verb;
    # infinitives must be a numpy array so it can be fancy-indexed.
    nn = NearestNeighbors(n_neighbors=1).fit(data)
    _, idx = nn.kneighbors(kmeans.cluster_centers_)
    for inf in infinitives[idx.flatten()]:
        print(inf)
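# Usage sketch for the two clustering helpers above. 'infinitives.txt' is a
# hypothetical newline-separated word list; extract_features comes from the
# surrounding project.
with open('infinitives.txt') as f:
    infinitives = np.array([line.strip() for line in f if line.strip()])
k_clusters(10, infinitives)  # prints inertia and one representative verb per centroid
affinity(infinitives)        # affinity propagation picks the number of clusters itself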
def plot_projection(model, infinitives, title):
    fig = pl.figure()
    # Binary model: does an n-gram appear or not?
    for i in range(1, 4):  # n-gram length (1 to 3)
        pl.subplot(2, 3, i)
        data, _ = extract_features(infinitives, i, False)
        projected_data = model.fit(data).transform(data)
        pl.scatter(projected_data[:, 0], projected_data[:, 1])
        pl.title('Binary %d-grams' % i)
    # Frequency model: count the occurrences of each n-gram
    for i in range(1, 4):
        pl.subplot(2, 3, 3 + i)
        data, _ = extract_features(infinitives, i, True)
        projected_data = model.fit(data).transform(data)
        pl.scatter(projected_data[:, 0], projected_data[:, 1])
        pl.title('Count %d-grams' % i)
    fig.text(.5, .95, title, horizontalalignment='center')
    pl.show()
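# Example call, reusing the infinitives array loaded above: any estimator
# with fit/transform works as the projection model, e.g. a 2-component PCA.
from sklearn.decomposition import PCA

plot_projection(PCA(n_components=2), infinitives,
                'PCA projection of infinitive n-gram features')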
import json

from flask import jsonify, request

import predict
import preprocess


def getName():
    """Flask view: label each word of a JSON {"text": ...} body as a name or not."""
    text = json.loads(request.data)['text']
    tkn_txt = preprocess.tokenize_text(text)
    tkn_txt = [t for t in tkn_txt if t != ""]  # drop empty tokens
    X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]
    pred = predict.predict_label(X)
    words = text.split(' ')
    respObj = {
        "data": [{'word': w, 'isName': p} for w, p in zip(words, pred[0])]
    }
    return jsonify(respObj)
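# Client-side sketch for the view above. The URL is hypothetical (the route
# decorator is not shown); the payload shape matches what getName() parses,
# and the printed labels are illustrative.
import requests

resp = requests.post('http://localhost:5000/getName',  # hypothetical endpoint
                     json={'text': 'My name is Saikat'})
print(resp.json())
# e.g. {"data": [{"word": "My", "isName": 0}, ..., {"word": "Saikat", "isName": 1}]}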
def get_features(self):
    """Extract features from the most recent classification window."""
    i_channel_data = self.i_channel_data
    i_start = i_channel_data - self.classification_length
    if i_start < 0:
        # The window wraps around the circular buffer: stitch together the
        # first samples of channel_data with its tail.
        length_from_end = -i_start
        length_from_start = self.classification_length - length_from_end
        channel_data_copy = np.hstack(
            (self.channel_data[:, 0:length_from_start],
             self.channel_data[:, self.channel_data.shape[1] - length_from_end:]))
    else:
        channel_data_copy = self.channel_data[
            :, i_start:i_start + self.classification_length]
    features, _ = extract_features(channel_data_copy)
    return features.flatten()
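# Standalone numeric sketch of the wrap-around slicing in get_features above;
# the buffer contents and sizes here are illustrative.
import numpy as np

buf = np.arange(20).reshape(2, 10)  # 2 channels, ring buffer of 10 samples
i_end, win = 2, 4                   # write index 2, window length 4
i_start = i_end - win               # -2: the window wraps
length_from_end = -i_start                    # 2 samples from the buffer's tail
length_from_start = win - length_from_end     # 2 samples from its head
window = np.hstack((buf[:, :length_from_start],
                    buf[:, buf.shape[1] - length_from_end:]))
print(window)  # columns [0, 1, 8, 9] of each channel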
from bs4 import BeautifulSoup

# input_types, is_invisible, get_xpath and extract_features are defined
# elsewhere in this module.


def get_inputs(dom):
    soup = BeautifulSoup(dom, 'html5lib')
    # Remove tags whose content is never rendered.
    for invisible_tag in ['style', 'script', '[document]', 'head', 'title']:
        for tag in soup.find_all(invisible_tag):
            tag.decompose()
    # Special-case nested spans not caught in off-line forms ...
    for element in soup.find_all():
        if element.attrs and element.has_attr('unselectable') \
                and 'on' in element['unselectable']:
            element.clear()
    # Collect a (feature, xpath) pair for every visible input field.
    input_list = []
    for input_type in input_types:
        inputs = soup.find_all('input', attrs={'type': input_type})
        for my_input in inputs:
            if is_invisible(my_input):
                continue
            xpath = get_xpath(my_input)
            feature = extract_features(my_input)
            feature = ' '.join(sorted(feature.split()))  # canonical token order
            input_list.append((feature, xpath))
    return input_list
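# Minimal usage sketch for get_inputs. The HTML is illustrative, and the
# helpers noted above (input_types, is_invisible, get_xpath, extract_features)
# must be supplied by the surrounding module.
dom = ('<form>'
       '<input type="text" name="username"/>'
       '<input type="password" name="pw"/>'
       '</form>')
for feature, xpath in get_inputs(dom):
    print(feature, xpath)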
import preprocess

tkn_txt = preprocess.tokenize_text('My name is Saikat')
tkn_txt = [t for t in tkn_txt if t != ""]  # drop empty tokens
print(preprocess.pos_tagger([tkn_txt]))
X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]
print(preprocess.predict_label(X))