def test(self, item):
    """Score a single revision item for vandalism likelihood.

    Computes a per-user (or per-country for anonymous editors), per-hour
    and per-day vandalism ratio from the trained counters, but currently
    returns the constant 1 — the combined formula is commented out.

    Raises:
        Exception: if the item has no current revision.
    """
    revs = Feature.revs(item)
    current = revs["current"]
    if current is None:
        raise Exception("Empty revision!")

    user = current["user"]
    is_guest = user["id"] is None

    # Baseline vandalism rate: guests and registered users are pooled separately.
    if is_guest:
        avg_score = self.guests["vandal"] / self.guests["total"]
    else:
        avg_score = self.users["vandal"] / self.users["total"]

    user_score = avg_score
    if is_guest:
        # For anonymous editors, refine by country of origin when we have
        # enough observations (at least 10 recorded vandal edits).
        country_code = self.get_country(user["name"])
        stats = self.countries.get(country_code)
        if stats is not None and stats["vandal"] >= 10:
            user_score = stats["vandal"] / stats["total"]

    ts = current["timestamp"]
    hour_stats = self.hours[ts.hour]
    hour_score = hour_stats["vandal"] / hour_stats["total"]
    day_stats = self.days[ts.weekday()]
    day_score = day_stats["vandal"] / day_stats["total"]

    # NOTE(review): all computed scores are currently unused — the author
    # left the real formula commented out. Preserved as-is.
    return 1  # hour_score * day_score
def create_feature_instance(self) -> Feature:
    """Assemble and return a Feature from the current UI/form selections.

    Reads the shape, name, units, dtype and annotations accessors in the
    same order as before (preserved in case any of them has side effects).
    """
    feat_shape = self.feature_shape()
    feat_name = self.feature_name()
    feat_units = self.feature_units()
    feat_dtype = self.feature_dtype()
    feat_annotations = self.feature_annotations()
    return Feature(
        feat_name,
        self.recording,
        self.current_channel.id,
        units=feat_units,
        datapoint_shape=feat_shape,
        data_type=feat_dtype,
        annotations=feat_annotations,
    )
def split_features_by_distance(features, distance):
    """
    Split up each geometry in a list of features based on distance

    Arguments:
        features {list} -- List of Feature objects
        distance {int|float} -- Approx distance in metres between splits
    """
    # Each source feature may yield several geometries; every piece keeps
    # the data of the feature it came from.
    return [
        Feature(piece, feature.data)
        for feature in features
        for piece in split_line_by_distance(feature.geom, distance)
    ]
def snap_features(r, features):
    """
    Geometrically 'snap' (connect) features together which are within
    radius `r` of each other

    Arguments:
        r {int|float} -- Tolerance radius in metres within which to snap
            lines together
        features {list} -- list of Features

    Returns:
        list -- list of snapped Features
    """
    geoms = [feature.geom for feature in features]
    snapped = snap_linestrings(r, geoms)
    # snap_linestrings preserves order, so pair results back with their
    # source features to carry the data across.
    return [Feature(geom, feature.data) for geom, feature in zip(snapped, features)]
def raster_to_features(path):
    """
    Convert each pixel in a raster to a Shapely Point located at that
    pixel's centroid, with a value attribute equal to the pixel's value.
    Return these as a list of Features.

    Only pixels with a value > 0 are emitted.

    Arguments:
        path {str} -- Path to raster file

    Note: TODO potential way to vectorise:
        # All rows and columns into numpy mesh grid
        # cols, rows = np.meshgrid(np.arange(A.shape[2]), np.arange(A.shape[1]))
        # All eastings and northings
        # lats, lons = np.vectorize(rc2en, otypes=[np.float, np.float])(rows, cols)
    """
    # Read raster
    with rasterio.open(path) as src:
        T0 = src.transform  # upper-left pixel corner affine transform
        p1 = pyproj.Proj(src.crs)  # NOTE(review): unused, kept in case CRS validation is intended
        A = src.read()  # pixel values, shape (bands, rows, cols)
        pixelSizeX, pixelSizeY = src.res

    # Get affine transform for pixel centres
    T1 = T0 * Affine.translation(0.5, 0.5)

    # Convert pixel row/column index (from 0) to easting/northing at centre
    def rc2en(row, col):
        return (col, row) * T1

    features = []
    it = np.nditer(A, flags=['multi_index'])
    while not it.finished:
        # BUGFIX: np.asscalar was removed in NumPy 1.23; .item() is the
        # documented replacement. Also compute the value once, not twice.
        value = it[0].item()
        if value > 0:
            # multi_index is (band, row, col)
            features.append(
                Feature(
                    Point(rc2en(it.multi_index[1], it.multi_index[2])), {
                        'value': value,
                        # assumes projected CRS with metre units — TODO confirm
                        'pixel_size': (pixelSizeX * pixelSizeY) * 1e-6
                    }))
        it.iternext()
    return features
def merge_features(features):
    """
    Merge feature geometries together where possible, forming several
    contiguous MultiLineStrings. Applies data of first feature to all.

    Arguments:
        features {list} -- list of Features
    """
    merged = linemerge([feature.geom for feature in features])
    # linemerge returns either a single geometry or a MultiLineString;
    # normalise to an iterable of geometries.
    if merged.geom_type == 'MultiLineString':
        geoms = merged.geoms
    else:
        geoms = [merged]
    shared_data = features[0].data
    return [Feature(geom, shared_data) for geom in geoms]
def train_one(self, raw):
    """Fold one raw record into the vandalism counters.

    Updates the guest/registered-user totals, the per-IP log for
    anonymous editors, and the per-weekday / per-hour tallies. Records
    without a current revision are skipped.
    """
    revs = Feature.revs(raw)
    curr = revs["current"]
    if curr is None:
        return

    vandal_int = 1 if raw["vandal"] else 0
    user = curr["user"]

    # A missing user id means an anonymous (IP) editor.
    if user["id"] is None:
        self._append_ip(user["name"], raw["vandal"])
        bucket = self.guests
    else:
        bucket = self.users
    bucket["total"] += 1
    bucket["vandal"] += vandal_int

    ts = curr["timestamp"]
    self.days[ts.weekday()]["vandal"] += vandal_int
    self.days[ts.weekday()]["total"] += 1
    self.hours[ts.hour]["vandal"] += vandal_int
    self.hours[ts.hour]["total"] += 1
def load_features(path, data=False):
    """
    From a shapefile, create a list of features with geometry and data
    loaded from file. If data is specified, data will instead be filled
    with whatever is provided.

    Arguments:
        path {str} -- Path to shapefile to load
        data {boolean|dict} -- False, or value to fill each feature's data with
    """
    features = []
    with fiona.open(path) as source:
        for record in source:
            geometry = record['geometry']
            if not geometry:
                print('Ignoring feature with no geometry...')
                continue
            shapely_class = get_shapely_class_from_geom_type(geometry['type'])
            # Truthy `data` overrides the file's own properties.
            feature_data = data if data else dict(record['properties'])
            features.append(Feature(shapely_class(shape(geometry)), feature_data))
    return features
    # NOTE(review): fragment of a larger per-record loop — the enclosing
    # `for raw in ...:` header is outside this chunk; the indentation below
    # is reconstructed and should be confirmed against the full file.
    # Skip records missing any required feature key.
    if any(f not in raw["f"] for f in OK_FEATURES):
        continue
    #raw["f"].pop('t_biscore', None)
    # Hand-tuned priors: guest edits weighted as more likely vandalism,
    # weekday (Mon-Fri) edits slightly more than weekend ones.
    vandal_score = 0.1216 if raw["f"]["lr_guest"] else 0.10
    day_score = 1 if raw["revs"][-1]["timestamp"].weekday() <= 4 else 0.97
    raw_sec.append([raw["f"]["t_biscore"]*day_score*vandal_score])
    raw_chr.append([raw["f"]["t_charscore"]])
    # Collect the feature vector in OK_F_LIST order.
    tmp = []
    for f in OK_F_LIST:
        tmp.append(raw["f"][f])
    # raw_list.append([x for n, x in raw["f"].items() if n in OK_FEATURES])
    raw_list.append(tmp)
    revs = Feature.revs(raw)
    # Build a human-checkable revision diff URL for manual review.
    raw["url"] = "https://ru.wikipedia.org/w/index.php?type=revision&diff={}&oldid={}".format(
        revs["current"]["id"], revs["prev_user"]["id"]
    );
    if raw["f"]["sb_added"] > 0:
        print(raw["url"])
    # Drop the bulky fields before keeping the record.
    del raw["revs"]
    del raw["rwords"]
    raw_orig.append(raw)
    raw_res.append(1 if raw["vandal"] else 0)

# NOTE(review): presumably after the loop — debugging early exit; the
# sklearn import below is unreachable at runtime.
print(len(raw_list))
sys.exit(0)
from sklearn.metrics import confusion_matrix
# Progress counter — presumably ticks/reports every 50 records; confirm
# against the Counter helper's definition.
cnt = Counter(50)
for raw in raw_collection.find({}):
    #if "tmp" in raw and raw["tmp"] is not None:
    #    continue
    # Need at least two revisions to diff current text against previous.
    if raw["revs"] is None or len(raw["revs"]) <= 1:
        #print(raw) this should not happen
        continue
    #if "rwords" in raw:
    #    cnt.tick()
    #    continue
    texts = Feature.revs(raw)
    # Skip records that lack either a previous editor's text or a current text.
    if texts['prev_user'] is None or texts['current']['text'] is None:
        continue
    if texts['prev_user']['text'] is None:
        continue
    cnt.tick()
    #if cnt.value() < 18000:
    #    continue
    # Normalise both sides before comparison.
    prev_text = strip_accents(texts['prev_user']['text'])  #strip_blockquotes
    curr_text = strip_accents(texts['current']['text'])
    # NOTE(review): the loop body appears to continue beyond this chunk.
flags2 = DepRepo.flags()
# Wiki user groups whose members' edits are treated as trusted.
TRUSTED_GROUPS = [
    'editor', 'autoeditor', 'rollbacker', 'reviewer', 'sysop', 'bureaucrat'
]
users = 0
total = 0
# Print review URLs for vandal-flagged edits that added no new links.
for item in collection.find({
        "f.link_avg_new": 0,
        "vandal": True
}, no_cursor_timeout=True):
    # Need at least two revisions to resolve current/previous pair.
    if len(item["revs"]) < 2:
        continue
    revs = Feature.revs(item)
    print(get_url(revs))

# NOTE(review): one-off debugging scratchpad; indentation after the loop
# is reconstructed — confirm against the full file. The script exits here,
# so the statements after sys.exit(0) never run.
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
test = morph.parse('')
print("http://google.com")
sys.exit(0)

client = MongoClient('localhost', 27017)
raw_list = []
raw_res = []