def init_feature_list():
    logging.info("init feature list")
    buf = []
    for col in BIN_COLS:
        buf.append(
            Feature(name=col, prefix=col, startid=1, type=FeatureType.BIN,
                    drop=False))
    for col in VAL_COLS:
        buf.append(
            Feature(name=col, prefix=col, startid=1, type=FeatureType.VAL,
                    drop=False))
    return buf
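# A self-contained sketch of how init_feature_list() might be exercised. The
# Feature dataclass, FeatureType enum, and the BIN_COLS/VAL_COLS values below
# are stand-ins for this project's real definitions, not its actual API.
import logging
from dataclasses import dataclass
from enum import Enum

class FeatureType(Enum):
    BIN = "bin"
    VAL = "val"

@dataclass
class Feature:
    name: str
    prefix: str
    startid: int
    type: FeatureType
    drop: bool

BIN_COLS = ["is_weekend"]  # placeholder binary columns
VAL_COLS = ["price"]       # placeholder value columns

print([f.name for f in init_feature_list()])  # ['is_weekend', 'price']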
class D(BaseModel, self.Settings):
    stream = Feature(TextStream, store=True)
    length = Feature(
        CharacterCountNonGeneratorProcessMethod, needs=stream, store=True)
    total = Feature(Total, needs=length, store=True)
def dryer_data2(*feature_names):
    # data[area][genus][(feature_values)] = language_count
    data = {}
    # language codes shared by all of the requested features
    languages = set()
    g = Genealogy()
    feature = Feature(feature_names[0])
    for language in feature.languages():
        languages.add(language.code)
    for feature_name in feature_names:
        feature = Feature(feature_name)
        this_set = set()
        for language in feature.languages():
            this_set.add(language.code)
        languages &= this_set
    for language_code in languages:
        language = g.find_language_by_code(language_code)
        area = language.area
        genus = language.genus.name
        value = ','.join(v['description']
                         for v in sorted(language.features.values()))
        data.setdefault(area, {})
        data[area].setdefault(genus, {})
        data[area][genus].setdefault(value, 0)
        data[area][genus][value] += 1
    return data
class Split(BaseModel, self.Settings):
    stream = Feature(TextStream, store=False)
    uppercase = Feature(
        ToUpper, needs=stream, store=True, persistence=settings1)
    lowercase = Feature(
        ToLower, needs=stream, store=True, persistence=settings2)
    cat = Feature(
        Concatenate, needs=[uppercase, lowercase], store=False)
def all_features(parse_dict, constituent, i, constituents):
    syntax_tree = constituent.syntax_tree
    conn_category = Connectives_dict().conn_category
    connective = constituent.connective

    # feature dicts
    feat_dict_CON_Str = {}
    feat_dict_CON_LStr = {}
    feat_dict_NT_Ctx = {}
    feat_dict_CON_NT_Path = {}
    feat_dict_CON_NT_Path_iLsib = {}

    # load dicts
    dict_CON_Str = NT_dict().dict_CON_Str
    dict_CON_LStr = NT_dict().dict_CON_LStr
    dict_NT_Ctx = NT_dict().dict_NT_Ctx
    dict_CON_NT_Path = NT_dict().dict_CON_NT_Path
    dict_CON_NT_Path_iLsib = NT_dict().dict_CON_NT_Path_iLsib

    # features
    conn_indices = connective.token_indices
    DocID = connective.DocID
    sent_index = connective.sent_index
    conn_node = dict_util.get_conn_node(syntax_tree, conn_indices)
    CON_Str = dict_util.get_CON_Str(parse_dict, DocID, sent_index, conn_indices)
    CON_LStr = CON_Str.lower()
    CON_Cat = conn_category[connective.name]
    CON_iLSib = dict_util.get_CON_iLSib(syntax_tree, conn_node)
    CON_iRSib = dict_util.get_CON_iRSib(syntax_tree, conn_node)
    NT_Ctx = dict_util.get_NT_Ctx(constituent)
    CON_NT_Path = dict_util.get_CON_NT_Path(conn_node, constituent)
    CON_NT_Position = dict_util.get_CON_NT_Position(conn_node, constituent)
    if CON_iLSib > 1:
        CON_NT_Path_iLsib = CON_NT_Path + ":>1"
    else:
        CON_NT_Path_iLsib = CON_NT_Path + ":<=1"

    features = []
    features.append(get_feature(feat_dict_CON_Str, dict_CON_Str, CON_Str))
    features.append(get_feature(feat_dict_CON_LStr, dict_CON_LStr, CON_LStr))
    features.append(get_feature(feat_dict_NT_Ctx, dict_NT_Ctx, NT_Ctx))
    features.append(
        get_feature(feat_dict_CON_NT_Path, dict_CON_NT_Path, CON_NT_Path))
    features.append(
        get_feature(feat_dict_CON_NT_Path_iLsib, dict_CON_NT_Path_iLsib,
                    CON_NT_Path_iLsib))
    # category
    dict_category = {"subordinator": 1, "coordinator": 2, "adverbial": 3}
    features.append(get_feature({}, dict_category, CON_Cat))
    # number
    features.append(Feature("", 1, {1: CON_iLSib}))
    features.append(Feature("", 1, {1: CON_iRSib}))
    # position
    dict_position = {"right": 1, "left": 2}
    features.append(get_feature({}, dict_position, CON_NT_Position))
    return util.mergeFeatures(features)
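# Note: the Feature("", 1, {index: value}) pattern used by these extractors
# appears to encode a sparse one-dimensional feature as (name, dimension,
# {index: value}); that reading is inferred from usage here, not documented.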
class D2(BaseModel, self.Settings):
    stream = Feature(TextStream, store=True)
    words = Feature(Tokenizer, needs=stream, store=False)
    count = JSONFeature(WordCount, needs=words, store=True)
    timestamp = JSONFeature(
        TimestampEmitter, version='2', needs=stream, store=True)
class Contrived(BaseModel, self.Settings):
    stream1 = Feature(TextStream, store=False)
    stream2 = Feature(TextStream, store=False)
    t1 = Feature(Tokenizer, needs=stream1, store=False)
    t2 = Feature(Tokenizer, needs=stream2, store=False)
    count1 = JSONFeature(WordCount, needs=t1, store=True)
    count2 = JSONFeature(WordCount, needs=t2, store=True)
    aggregate = JSONFeature(
        WordCountAggregator, needs=[count1, count2], store=True)
class D1(BaseModel, self.Settings):
    stream = Feature(TextStream, store=True)
    words = Feature(Tokenizer, needs=stream, store=False)
    count = JSONFeature(WordCount, needs=words, store=True)
    timestamp = JSONFeature(
        TimestampEmitter, version='1', needs=stream, store=True)
    validated = Feature(ValidatesDependencies, needs=stream, store=True)
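# If these are featureflow-style models (as the Feature/JSONFeature/needs/store
# pattern suggests), processing a document and reading a stored feature back
# would typically look like the sketch below; process() returning an id and
# construction from that id are assumptions based on that style.
_id = D1.process(stream='a simple text document')
doc = D1(_id)
print(doc.count)  # stored JSON word counts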
def main():
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    root = Feature('root')
    featureList = np.array([])
    for i in range(len(X[0])):
        feature = Feature('feature_%d' % i)
        root.transform('init', feature)
        featureList = np.append(featureList, feature)
    model = PCA(n_components=1)
    model.fit(X)
    doWithPCA(model, featureList)
    root.printTree()
def NT_curr_next_level_distance(parse_dict, constituent, i, constituents):
    if i == len(constituents) - 1:
        return Feature("", 1, {1: 100})
    curr_node = constituents[i].node
    next_node = constituents[i + 1].node  # renamed to avoid shadowing next()
    syntax_tree = constituent.syntax_tree
    root_node = syntax_tree.tree.get_tree_root()
    curr_level = int(syntax_tree.tree.get_distance(root_node, curr_node))
    next_level = int(syntax_tree.tree.get_distance(root_node, next_node))
    return Feature("", 1, {1: next_level - curr_level})
def main():
    X = [[1, 2], [2, 3]]
    root = Feature('root')
    featureList = np.array([])
    for i in range(len(X[0])):
        feature = Feature('feature_%d' % i)
        root.transform('init', feature)
        featureList = np.append(featureList, feature)
    # n_values was deprecated in scikit-learn 0.20 and removed in 0.22;
    # this call targets an older release.
    model = OneHotEncoder(n_values=[5, 8], sparse=True)
    model.fit(X)
    doWithOneHotEncoder(model, featureList)
    root.printTree()
def NT_prev_curr_level_distance(parse_dict, constituent, i, constituents):
    if i == 0:
        return Feature("", 1, {1: 100})
    curr_node = constituents[i].node
    prev_node = constituents[i - 1].node
    syntax_tree = constituent.syntax_tree
    root_node = syntax_tree.tree.get_tree_root()
    curr_level = int(syntax_tree.tree.get_distance(root_node, curr_node))
    prev_level = int(syntax_tree.tree.get_distance(root_node, prev_node))
    return Feature("", 1, {1: curr_level - prev_level})
def main():
    from sklearn.feature_selection import VarianceThreshold
    X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
    root = Feature('root')
    featureList = np.array([])
    for i in range(len(X[0])):
        feature = Feature('feature_%d' % i)
        root.transform('init', feature)
        featureList = np.append(featureList, feature)
    model = VarianceThreshold()
    model.fit(X)
    doWithSelector(model, featureList)
    root.printTree()
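# A hedged sketch of what the doWithSelector helper called above might do (its
# definition is not shown in this file). VarianceThreshold.get_support() is
# real scikit-learn API; the child naming and the Feature.transform usage
# simply mirror main() above.
def doWithSelector(model, featureList):
    for keep, feature in zip(model.get_support(), featureList):
        child = Feature('kept' if keep else 'dropped')
        feature.transform('variance_threshold', child)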
def __init__(self):
    self.train_file = FILE_PATH + '/../data/conll.nonexp.train'
    self.test_file = FILE_PATH + '/../data/conll.nonexp.test'
    self.model_file = FILE_PATH + '/../data/conll.nonexp.model'
    self.predicted_file = FILE_PATH + '/../data/conll.nonexp.test.predicted'
    self.feat_handle = Feature()
def prev_curr_some_clause(parse_dict, constituent, i, constituents):
    # feature
    connective = constituent.connective
    DocID = connective.DocID
    sent_index = connective.sent_index
    if (DocID, sent_index) not in dict_clauses:
        clauses_list = dict_util.get_sent_clauses(parse_dict, DocID, sent_index)
        dict_clauses[(DocID, sent_index)] = clauses_list
    clauses_list = dict_clauses[(DocID, sent_index)]  # e.g. [[1, 2], [4, 5, 6]]
    # For each constituent, check whether it lies in the same clause as the
    # previous constituent.
    prev_curr_some_clause = 0
    if i > 0:
        curr_clause_NO = -1
        for k, item in enumerate(clauses_list):
            if set(constituents[i].indices) <= set(item):
                curr_clause_NO = k
                break
        prev_clause_NO = -1
        for k, item in enumerate(clauses_list):
            if set(constituents[i - 1].indices) <= set(item):
                prev_clause_NO = k
                break
        if curr_clause_NO != -1 and prev_clause_NO != -1 \
                and curr_clause_NO == prev_clause_NO:
            prev_curr_some_clause = 1
    return Feature("", 1, {1: prev_curr_some_clause})
def __init__(self, filename=""):
    # Read file and generate list of features
    self.filename = filename
    if filename != "":
        try:
            if not os.path.isfile(filename):
                raise GFF_IOError("Could not open file '" + filename +
                                  "': Not a file.")
            f = open(filename, 'r')
        except IOError as s:
            raise GFF_IOError("Could not open file '" + filename + "': " +
                              str(s))
        lines = f.readlines()
        lines = [line for line in lines if line.strip()]
        features = []
        for l in lines:
            try:
                feat = Feature(l)
            except FeatureComment:
                pass
            except FeatureInputError as s:
                sys.stderr.write('Error in feature: ' + str(s) + " " + l)
            else:
                features.append(feat)
def map(self, f):
    """Return a generator of features with f applied.

    :param f: a function that takes a feature as its argument.
    """
    for feature in self:
        yield Feature(f(feature))
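# Usage sketch (hypothetical): `pool` stands for an instance of the collection
# class that defines map() above; the lambda is a stand-in transform, and each
# yielded item is a new Feature wrapping the transform's return value.
transformed = list(pool.map(lambda feat: feat))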
def verbs(relation, parse_dict):
    # load dict
    dict_verb_classes = Non_Explicit_dict().dict_verb_classes

    # 1. the number of pairs of verbs in Arg1 and Arg2 from the same verb class
    Arg1_words = dict_util.get_Arg_Words_List(relation, "Arg1", parse_dict)
    Arg2_words = dict_util.get_Arg_Words_List(relation, "Arg2", parse_dict)
    count = 0
    for w1, w2 in [(w1.lower(), w2.lower())
                   for w1 in Arg1_words for w2 in Arg2_words]:
        if w1 in dict_verb_classes and w2 in dict_verb_classes:
            c1 = dict_verb_classes[w1]
            c2 = dict_verb_classes[w2]
            if set(c1.split("#")) & set(c2.split("#")):
                count += 1
    feat_1 = Feature("", 1, {1: count})

    # 2. POS of the main verb
    Arg1_MV_POS = dict_util.get_main_verb_pos(relation, "Arg1", parse_dict)
    Arg2_MV_POS = dict_util.get_main_verb_pos(relation, "Arg2", parse_dict)
    MV_POS_feature_list = Arg1_MV_POS + Arg2_MV_POS
    MV_POS_feature = get_feature_by_list(MV_POS_feature_list)

    return util.mergeFeatures([feat_1, MV_POS_feature])
def HarrisCorner(images, isDisplay):
    for image in images:
        features = []
        logging.info(f'IMAGE {image.getImageID():02d}: Applying Harris Corner Detection')
        imageName = image.getImageName()
        img = cv.imread(imageName)
        gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        gray = np.float32(gray)
        dst = cv.cornerHarris(gray, 2, 3, 0.0001)
        ret, dst = cv.threshold(dst, 0.001 * dst.max(), 255, 0)
        dst = np.uint8(dst)
        ret, labels, stats, centroids = cv.connectedComponentsWithStats(dst)
        # refine corner locations to sub-pixel accuracy
        criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 100, 0.001)
        corners = cv.cornerSubPix(gray, np.float32(centroids), (5, 5), (-1, -1),
                                  criteria)
        for corner in corners:
            if isDisplay:
                # img[dst > dst.max()] = [0, 0, 255]
                cv.circle(img, (int(corner[0]), int(corner[1])), 4,
                          (0, 0, 255), -1)
            feature = Feature(corner[0], corner[1], image)
            features.append(feature)
        if isDisplay:
            cv.imshow(f'Image {image.getImageID()}', img)
            cv.waitKey(0)
            cv.destroyAllWindows()
        image.setFeatures(features)
def readGeonamesFile(filename, feature_codes):
    linecounter = 0
    # contains the features from the file
    features = {}
    # contains the field names from the header row
    fields = {}
    # read the geonames file into a dictionary
    logging.debug("processing geonames file")
    geonames_file = open(filename, "r")
    for line in geonames_file.readlines():
        linecounter += 1
        row = line.split("\t")
        # assign the first row to the header
        if linecounter == 1:
            # dictionary of field names and their positions, from the header row
            logging.debug("processing geonames file header row")
            for i in range(len(row)):
                # strip the newline from the field name (last field)
                fields.update({row[i].strip(): i})
        else:
            # construct the feature object
            feature = Feature(fields, row)
            # look up the name corresponding to the feature class and code
            feature.featureName = feature_codes.get(feature.featureClassCode)
            features.update({feature.geonameid: feature})
    geonames_file.close()
    return features
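# Usage sketch: the geonames dump filename is a placeholder, and passing an
# empty feature-code mapping simply leaves each featureName as None.
features = readGeonamesFile("allCountries.txt", feature_codes={})
print(len(features), "features loaded")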
def __init__(self, K, middle_state=None):
    self.K = K
    self.middle_state = middle_state
    self.patient_cluster = dict()  # pat_id: cluster index
    self.patient_info = dict()     # pat_id: Patient
    self.patient_id = list()       # pat_id
    self.feature = Feature()       # store
def transform_single(self, f):
    st = feature_summary(f.data)
    return Feature.merge_instances(
        f, Feature(f.name, f.data, st))
def load_feat_dataset(data_dir):
    labels = os.path.join(data_dir, "labels")
    label_imgs = get_filelist(labels, ".png")
    poses = load_pose(os.path.join(data_dir, "poses.txt"))
    img_name_dict = {}
    for i in range(len(label_imgs)):
        img_name = os.path.split(label_imgs[i])[-1][:-4]
        img_name_dict[img_name] = label_imgs[i]
    seg_feature_dataset = []
    cnt, step = 0, 1
    for i in tqdm(range(len(poses))):
        pose = poses[i]
        if i == 0:
            x_last, y_last = pose.data[0], pose.data[1]
        # odometry increment since the previous pose
        acc_veh_odm = np.sqrt((pose.data[0] - x_last)**2 +
                              (pose.data[1] - y_last)**2)
        x_last, y_last = pose.data[0], pose.data[1]
        if pose.name in img_name_dict:
            if cnt % step == 0:
                _img_path = img_name_dict[pose.name]
                _hist_l, _hist_c, _hist_r = load_label_img(_img_path)
                _feature = Feature()
                _feature.parse(pose, _hist_l, _hist_c, _hist_r, acc_veh_odm)
                seg_feature_dataset.append(_feature)
            cnt += 1
    return seg_feature_dataset
def eval_simple_agents():
    """ Run simple baselines on each split. """
    for split in ["train", "val_seen", "val_unseen"]:
        env = R2RBatch(
            Feature(None, False),
            False,
            False,
            6,
            False,
            "lstm",
            batch_size=1,
            splits=[split],
            tokenizer=None,
        )
        ev = Evaluation([split], encoder_type="lstm")  # subgoal=False)
        for agent_type in ["Stop", "Shortest", "Random"]:
            outfile = "%s%s_%s_agent.json" % (RESULT_DIR, split,
                                              agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score(outfile)
            print("\n%s" % agent_type)
            pp.pprint(score_summary)
def clause_word_num(arg_clauses, clause_index, parse_dict):
    # feature
    clause_word_num = len(arg_clauses.clauses[clause_index][0])
    # int key, consistent with the other Feature("", 1, {1: ...}) extractors
    return Feature("", 1, {1: clause_word_num})
def main(args):
    # Load configuration
    config = Configuration(args.yaml_path)

    print("Loading Probase...")
    probase = Probase(config)

    print("Loading dataset...")
    dataset = Data(config)

    print("Loading NLP utility...")
    nlp = NLP('en')

    print("Loading feature extractor...")
    features = Feature(config, probase, nlp=nlp)

    print("Extracting vector features")
    features.extract_vector_features(dataset)

    print("Extracting statistical vector features")
    features.extract_statistical_features(dataset)

    print("Evaluating classifiers")
    ev = Evaluation(config, dataset)
    ev.full_evaluation(features.X, features.y)
def gen_feature_pool_from_array(fm, d):
    c_id = 0
    for f in fm:
        yield Feature(f.name, d[:, c_id], f.st)
        c_id += 1
    assert c_id == d.shape[1], \
        "Result feature pool and given array didn't match: {} != {}".format(
            c_id, d.shape[1])
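# Usage sketch: rebuild a feature pool from a transformed 2-D array whose
# columns still correspond one-to-one with the original feature list
# (old_pool and transformed_array are placeholders).
new_pool = list(gen_feature_pool_from_array(old_pool, transformed_array))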
def guess_three(self, season, round_num):
    feature = Feature(4)
    feature.prepare_team_ranking()
    with codecs.open('input/yingchao_result.csv', 'rb', 'utf-8') as file:
        next(file)  # skip the header row
        for row in file:
            row = row.strip('\r\n')
            columns = row.split(',')
            if season == int(columns[0]) and round_num == int(
                    columns[2]) and int(columns[1]) == 4:
                home_team = Feature.team_name_clear(columns[3])
                guest_team = Feature.team_name_clear(columns[4])
                home_key = Feature.create_team_id('2017', columns[1],
                                                  home_team)
                guest_key = Feature.create_team_id('2017', columns[1],
                                                   guest_team)
                home_ranking = feature.get_team_ranking(home_key)
                guest_ranking = feature.get_team_ranking(guest_key)
                diff_ranking = home_ranking - guest_ranking
                print(home_team + '\t' + guest_team + '\t' +
                      str(home_ranking) + '\t' + str(guest_ranking) + '\t' +
                      str(diff_ranking))
def __init__(self):
    self.feature = Feature(None, None,
                           color_space=color_space,
                           spatial_size=spatial_size,
                           hist_bins=hist_bins,
                           orient=orient,
                           pix_per_cell=pix_per_cell,
                           cell_per_block=cell_per_block,
                           hog_channel=hog_channel,
                           spatial_feat=spatial_feat,
                           hist_feat=hist_feat,
                           hog_feat=hog_feat,
                           show_debug=False)
    self.feature.learn()

    # scale the feature matrix X
    scaled_X, self.X_scaler = self.feature.get_scaled_X()

    # Define the labels vector
    y = self.feature.get_label_vector()

    # Split up data into randomized training and test sets
    rand_state = np.random.randint(0, 100)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_X, y, test_size=0.2, random_state=rand_state)

    print('Using:', orient, 'orientations', pix_per_cell,
          'pixels per cell and', cell_per_block, 'cells per block')
    print('Feature vector length:', len(X_train[0]))

    # Use a linear SVC
    self.c = Classifier('SVC')
    self.c.run(X_train, y_train)
    print('SVC Accuracy: ', self.c.getAccuracy(X_test, y_test))
def get_feature(self, name):
    try:
        redis = Redis(connection_pool=self.redis_pool)
        return Feature(name,
                       self.__get_feature_percentage(name),
                       redis.smembers(f"feature:{name}:users"))
    except Exception:
        # logger.exception already records the active traceback
        logger.exception(
            f"[PyCurtain] Redis error while getting data from feature [{name}]")
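# Usage sketch (service is a hypothetical instance of the wrapper class that
# owns redis_pool). Note the method above logs on a Redis error and implicitly
# returns None, so callers should guard against that.
feature = service.get_feature("dark-mode")
if feature is not None:
    print(feature)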