def insert_dataset(nex_session, fw, x, parent_dataset_id):
    """Insert a Dataset row built from the dict *x* and return its new id.

    Args:
        nex_session: active SQLAlchemy session used for the insert.
        fw: open file-like object that receives a one-line progress log.
        x: dict of column values; 'format_name', 'display_name' and
            'source_id' are required, the remaining keys are optional.
        parent_dataset_id: unused here — the parent id is read from
            x.get('parent_dataset_id') instead.  # NOTE(review): confirm.

    Returns:
        The database-generated dataset_id of the inserted row.
    """
    # Python 2 print statement: debugging trace of the raw input dict.
    print "DATASET:", x
    y = Dataset(format_name=x['format_name'],
                display_name=x['display_name'],
                obj_url="/dataset/" + x['format_name'],
                source_id=x['source_id'],
                dbxref_id=x.get('dbxref_id'),
                dbxref_type=x.get('dbxref_type'),
                date_public=x.get('date_public'),
                parent_dataset_id=x.get('parent_dataset_id'),
                assay_id=x.get('assay_id'),
                channel_count=x.get('channel_count'),
                sample_count=x.get('sample_count'),
                is_in_spell=x.get('is_in_spell'),
                is_in_browser=x.get('is_in_browser'),
                description=x.get('description'),
                created_by=CREATED_BY)
    nex_session.add(y)
    # flush + refresh so the database-generated primary key is populated
    # before it is returned.
    nex_session.flush()
    nex_session.refresh(y)
    fw.write("Insert dataset: " + x['display_name'] + " into database\n")
    return y.dataset_id
def create_featureset(sessionconfig, params):
    """Create a feature-set Dataset row owned by an existing analysis module.

    Args:
        sessionconfig: (session, config) pair — a SQLAlchemy session and a
            dict of configuration values (metadata file paths).
        params: dict with the new feature set's attributes; 'modulename'
            selects the owning Analysis row by name.

    Raises:
        RuntimeError: if the named module does not exist, or a feature set
            with the same name is already present.
    """
    session = sessionconfig[0]
    config = sessionconfig[1]
    modulename = params['modulename']
    analysisMod = session.query(Analysis).from_statement(text("SELECT * FROM analysis where name=:name")).\
        params(name=modulename).first()
    if(analysisMod):  # Check if the module exists
        # module_id = analysisMod.id
        checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
            params(name=params['name']).first()
        if(checkDataset is None):
            dataset = Dataset(name=params['name'],
                              identifier='',
                              description=params['description'],
                              details=params['details'],
                              module_parameters=params['module_parameters'],
                              created=params['created'],
                              user=params['user'],
                              fileformat="Parquet",
                              filepath=params['filepath'],
                              schema=params['schema'],
                              module_id=analysisMod.id)
            # Back up the metadata store before mutating it.
            shutil.copyfile(config['METADATA_LOCAL_PATH'],
                            config['BACKUP_METADATA_LOCAL_PATH'])
            session.add(dataset)
            session.commit()
        else:
            raise RuntimeError('The feature set with the name ' + params['name'] + ' already exists')
    else:
        raise RuntimeError('No Such Module')
def create_dataset(sessionconfig, params):
    """Register a new Dataset row and push the metadata store to the backend.

    Args:
        sessionconfig: (session, config) pair — a SQLAlchemy session and a
            dict of configuration values (paths, BACKEND selector).
        params: dict with the new dataset's attributes; 'name' must be unique.

    Raises:
        RuntimeError: if a dataset with the same name already exists.
    """
    session = sessionconfig[0]
    config = sessionconfig[1]
    checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
        params(name=params['name']).first()
    if(checkDataset is None):
        dataset = Dataset(name=params['name'],
                          identifier=params['identifier'],
                          description=params['description'],
                          details=params['details'],
                          module_parameters='',
                          created=params['created'],
                          user=params['user'],
                          fileformat="Parquet",
                          filepath=params['filepath'],
                          schema=params['schema'],
                          module_id='')
        # Back up the metadata store before committing the new row.
        shutil.copyfile(config['METADATA_LOCAL_PATH'],
                        config['BACKUP_METADATA_LOCAL_PATH'])
        session.add(dataset)
        session.commit()
        # Decide which (remote_name, local_path) pairs to ship per backend.
        objs = []
        if(config['BACKEND'] == 'hdfs'):
            objs.append((config['MODULES_DIR'] + 'sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'swift'):
            objs.append(('sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'nfs'):
            pass
        # NOTE(review): runs for every backend; the 'nfs' branch ships an
        # empty list — confirm saveObjsBackend tolerates that.
        saveObjsBackend(objs, config['BACKEND'], config)
    else:
        raise RuntimeError("The dataset with name " + params['name'] + " already exists")
def datasetr():
    """Fetch gender-labelled tweets per the submitted form and render results.

    Reads search parameters from the POST form, calls getTweets2 to collect
    (tweets, labels), and renders a summary (total / female / male counts).
    """
    print(request.method)
    form = Dataset(request.form)
    nameMale = request.form["manname"]
    femalName = request.form["womenname"]
    language = request.form["language"]
    time = request.form["time"]
    min_tweet = request.form["min_number"]
    # y holds one 0/1 gender label per tweet (0 and 1 counted below).
    (tweets, y) = getTweets2(int(time), nameMale, femalName, language,
                             int(min_tweet))
    # (commented-out hard-coded sample `tweets` / `y` lists and the
    #  `form.tweet = 'ada'` experiment removed for readability)
    y2 = np.array(y)
    f = np.count_nonzero(y2 == 0)
    h = np.count_nonzero(y2 == 1)
    # French labels are user-facing strings; kept verbatim.
    detail = [
        'Nombre de tweets: ' + str(len(y)),
        'Femme: ' + str(f),
        'Homme: ' + str(h)
    ]
    #flash(str(request.form))
    # NOTE(review): a non-POST request falls through and returns None,
    # which Flask treats as an error — confirm this route is POST-only.
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form,
                               tweets=tweets, detail=detail)
def save_form_data(form, file_name):
    """Persist an uploaded dataset's form fields as a Dataset row."""
    # Copy every submitted field, then drop the form-machinery keys that
    # the Dataset model does not accept.
    fields = {key: form[key] for key in form}
    del fields['submit']
    del fields['csrf']
    fields['file_name'] = file_name
    record = Dataset(**fields)
    db.session.add(record)
    db.session.commit()
def create_dataset(self, user, name, parent=None):
    """Create an on-disk directory for a new dataset owned by *user*.

    Args:
        user: owner object; user.name is hashed into the storage path.
        name: name of the new dataset.
        parent: optional parent dataset *name*; resolved to a Dataset row.
    """
    if parent:
        # Resolve the parent dataset by name; yields None when not found.
        parent = self.session.query(Dataset).filter(
            Dataset.name == parent).first()
    # NOTE(review): hashlib.sha256 requires bytes on Python 3 — this call
    # only works on Python 2 unless (user.name + name) is .encode()d.
    path = hashlib.sha256(user.name + name).hexdigest()
    # NOTE(review): parent is wrapped in a one-element list even when the
    # lookup returned None — confirm the relationship expects [None].
    dataset = Dataset(name=name, owner=user, path=path, parent=[parent])
    abspath = os.path.join(self.path, path)
    os.makedirs(abspath)
    # NOTE(review): `dataset` is neither added to the session nor
    # returned here — confirm persistence happens elsewhere.
def example():
    """Build and return the symbolic model formula for dataset 0."""
    meta, data = Dataset().at(0, xy=False)
    sym = Symbol(meta, data)
    # Left side: log-transformed count.
    lhs = sym['count'].apply(np.log)
    # Right side: age x smoke interaction plus the main effects.
    interaction = sym[['age', 'smoke']].interact(
        lambda pair: pair[0] * pair[1], name='age_smoke')
    rhs = (interaction + sym['age'] + sym['smoke'] + sym['drug']
           + sym['partners'] + sym['cesd'])
    return lhs == rhs
def savecsv():
    """Save the tweets carried in the POST form to a user-chosen file.

    Depending on the 'typef' form field the data is written as JSON or as
    an XLSX sheet; a Tk save-dialog (running on the server) picks the path.
    """
    print(request.method)
    form = Dataset(request.form)
    if 'tweets' in request.form:
        tweets = request.form['tweets']
        # SECURITY(review): eval() on request data executes arbitrary
        # client-supplied Python — should be ast.literal_eval instead.
        tweetsR = eval(tweets)
        #return str(tweetsR[0])
        #name = asksaveasfilename()
        if request.form['typef'] == 'json':
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/", title="Destination du Json",
                filetypes=(("JSON", "*.json"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            with open(root.filename + '.json', 'w') as out_f:
                json.dump(tweetsR, out_f)
                out_f.close()  # redundant inside `with`; kept as-is
        else:
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/", title="Destination du dataset",
                filetypes=(("xlsx", "*.xlsx"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            workbook = xlsxwriter.Workbook(root.filename + '.xlsx')
            worksheet = workbook.add_worksheet()
            # Row 0 is the header; data rows start at 1.
            rowEx = 1
            worksheet.write(0, 0, 'text')
            worksheet.write(0, 1, 'gender')
            for tweet in tweetsR:
                worksheet.write(rowEx, 0, tweet[0])
                worksheet.write(rowEx, 1, tweet[1])
                rowEx += 1
            workbook.close()
    #with open(name + '.csv', 'w', newline='') as csvfile:
    #    create = csv.writer(csvfile)
    #    create.writerow(tweets)
    # (commented-out hard-coded sample `tweets` / `y` lists removed)
    #flash(str(request.form))
    # NOTE(review): a non-POST request falls through and returns None.
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form,
                               tweets=tweetsR)
def init_db():
    """Drop and recreate all tables, then load demo fixtures.

    Destructive: drop_all wipes any existing data before the fixtures
    (departments, roles, users and one sample Dataset) are inserted.
    """
    # import all modules here that might define models so that
    # they will be registered properly on the metadata. Otherwise
    # you will have to import them first before calling init_db()
    from models import Department, User, Role, Dataset
    Base.metadata.drop_all(bind=engine)
    Base.metadata.create_all(bind=engine)
    # Create the fixtures
    engineering = Department(name='Engineering')
    db_session.add(engineering)
    hr = Department(name='Human Resources')
    db_session.add(hr)
    manager = Role(name='manager')
    db_session.add(manager)
    engineer = Role(name='engineer')
    db_session.add(engineer)
    peter = User(name='Peter', department=engineering, role=engineer)
    db_session.add(peter)
    roy = User(name='Roy', department=engineering, role=engineer)
    db_session.add(roy)
    tracy = User(name='Tracy', department=hr, role=manager)
    db_session.add(tracy)
    # Dataset fixture: deterministic x/z series plus fake person names.
    import random
    from random import randint
    from faker import Faker
    fake = Faker('en_US')
    nPoints = 11
    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    data = {
        'x': [int(i) for i in range(nPoints)],
        'z': [float(i) for i in range(nPoints)],
        'names': [fake.name() for i in range(nPoints)]
    }
    test_data1 = Dataset(name='dataset1', description='First dataset',
                         table_name='data1', enabled=True, raw=data)
    db_session.add(test_data1)
    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    # test_data2 = Dataset(name='dataset2', description='Second dataset', table_name='data2', enabled=False, raw=data)
    # db_session.add(test_data2)
    db_session.commit()
def get_dataset(datastore, id):
    """Return a Dataset for *id* when its '.valid' marker checks out, else None."""
    try:
        marker = datastore.read('{0}/uploads/.valid'.format(id))
    except AttributeError:
        # The datastore could not serve the marker file.
        return None
    # The marker must contain exactly the dataset id to be trusted.
    if marker.read() != id:
        return None
    result = Dataset(id)
    result.datastore = datastore
    return result
def dataset():
    """Render the dataset form; re-render it on validation failure or POST.

    Returns:
        The rendered 'dataset.html' template response.
    """
    print(request.method)
    form = Dataset(request.form)
    if not form.validate_on_submit():
        return render_template('dataset.html', form=form)
    if request.method == 'POST':
        # BUG FIX: the original dropped the `return`, so a valid POST fell
        # through and the view returned None (a 500 error in Flask).
        return render_template('dataset.html', form=form)
def new_dataset(): ''' Create a unique url for this dataset to work under Create a folder on S3 using this url ''' # Make a new dataset object id = str(uuid.uuid4()) dataset = Dataset(id) dataset.datastore = make_datastore(app.config['DATASTORE']) # Write a verifying file to prove we created these folders validname = '{0}/uploads/.valid'.format(dataset.id) dataset.datastore.write(validname, StringIO(dataset.id)) return redirect('/datasets/' + dataset.id)
def initvalidateddataset():
    """Seed the Dataset table from the validated-data CSV.

    Each line of the CSV is 'audio_path,length,text'. Every row is stored
    as already human-validated (validated by user 'edresson').
    """
    # BUG FIX: the original called open() without ever closing the file;
    # a context manager releases the handle deterministically.
    with open(data_validated_csv) as csv_file:
        lines = csv_file.readlines()
    for line in lines:
        # maxsplit=2 keeps any commas inside the transcription text intact
        # (the original bare split(',') raised ValueError on such lines).
        audio_path, length, text = line.split(',', 2)
        text = text.replace('\n', '')
        new_data = Dataset()
        new_data.text = text
        new_data.audio_lenght = length  # model attribute name kept as-is (sic)
        new_data.file_path = audio_path
        new_data.file_with_user = 0       # 1 while a user is validating this row
        new_data.instance_validated = 1   # 1 once a human validated it
        new_data.instance_valid = 1       # 1 if the instance is usable
        new_data.user_validated = 'edresson'
        db.session.add(new_data)
        db.session.commit()
def add_dataset():
    """Create a Dataset from the JSON request body; abort 422 on failure."""
    payload = request.get_json()
    dataset_name = payload.get('name')
    dataset_type = payload.get('type')
    dataset_description = payload.get('description')
    dataset_provider = payload.get('provider_id')
    try:
        record = Dataset(dataset_name, dataset_provider, dataset_type,
                         dataset_description)
        record.insert()
        return jsonify({'success': True, 'dataset_id': record.id})
    except Exception as err:
        # Log the failure, then report an unprocessable-entity error.
        print(err)
        abort(422)
def example_1_2_6():
    """Average repeated weight measurements per id, then build the formula.

    Returns:
        The symbolic model: weight ~ gender + dose + group-size term.
    """
    # Data pre-processing (translated from: 数据预处理)
    meta, data= Dataset().at(4, xy=False)
    mapper = collections.Counter(data['id'])
    for id in mapper:
        index = data['id'] == id
        # Replace every subject's weights with their within-subject mean.
        data.loc[index, 'weight'] = data.loc[index, 'weight'].mean()
    # Symbolic representation of the linear model (translated from: 线性模型的符号表示)
    symbol = Symbol(meta, data)
    symbols = (
        symbol['weight'] ==
        symbol['gender'] + symbol['dose'] +
        # Map each id to its measurement count ('size' term).
        symbol['id'].apply(lambda xs: [mapper[x] for x in xs], 'size')
    )
    return symbols
def diff_datasets(dataset_a=None, dataset_b=None):
    """Compare two datasets and return their additive difference.

    When an attribute differs between the two, the value from *dataset_b*
    prevails — unless dataset_b's value is empty, in which case
    dataset_a's value is kept.

    Args:
        dataset_a: Dataset instance (only Dataset is accepted).
        dataset_b: Dataset instance (only Dataset is accepted).

    Returns:
        Dataset built from the merged attribute dict, carrying
        dataset_a's resources as its distributions.

    Raises:
        TypeError: if one or both arguments are not of class Dataset.
    """
    from models import Dataset
    for v in [dataset_a, dataset_b]:
        if not isinstance(v, Dataset):
            raise TypeError(
                'Para comparar los datasets ambos deben ser de clase Dataset.'
            )
    diff_ds = {}
    # Bookkeeping attributes that must not take part in the diff.
    omit_this_keys = ['required_keys', 'context']
    for k, v in dataset_a.__dict__.items():
        if k not in omit_this_keys:
            if v != dataset_b.__dict__[k]:
                # NOTE(review): len() assumes every differing attribute is
                # sized (str/list/dict) — a numeric attribute would raise.
                diff_ds.update({
                    k: dataset_b.__dict__[k]
                    if len(dataset_b.__dict__[k]) > 0 else v
                })
            else:
                diff_ds.update({k: v})
    return Dataset(datadict=diff_ds,
                   _distributions=dataset_a.__dict__['resources'],
                   _distribution_literal=True)
def freeze_dataset(self, id_or_name):
    """Create a temporary snapshot of a dataset's content.

    Args:
        id_or_name: str — id or name of the dataset to freeze.

    Returns:
        Dataset: when the object can be located (and is thus freezable);
        implicitly None otherwise.

    Raises:
        ValueError: id_or_name is a str/unicode of len == 0.
        TypeError: id_or_name is not str or unicode.
    """
    from models import Dataset
    stored_dataset = self.retrieve_dataset_metadata(id_or_name)
    if stored_dataset:
        # Copy only the whitelisted metadata fields into the snapshot;
        # groups are flattened to their names.
        freezed_dataset = {
            "license_title": stored_dataset['license_title'],
            "maintainer": stored_dataset['maintainer'],
            "private": stored_dataset['private'],
            "maintainer_email": stored_dataset['maintainer_email'],
            "id": stored_dataset['id'],
            "owner_org": stored_dataset['owner_org'],
            "author": stored_dataset['author'],
            "isopen": stored_dataset['isopen'],
            "author_email": stored_dataset['author_email'],
            "state": stored_dataset['state'],
            "license_id": stored_dataset['license_id'],
            "type": stored_dataset['type'],
            "groups": [g['name'] for g in stored_dataset['groups']],
            "creator_user_id": stored_dataset['creator_user_id'],
            "name": stored_dataset['name'],
            "url": stored_dataset['url'],
            "notes": stored_dataset['notes'],
            "title": stored_dataset['title'],
            "license_url": stored_dataset['license_url']
        }
        return Dataset(datadict=freezed_dataset,
                       _distribution_literal=True,
                       _distributions=stored_dataset['resources'])
def add_testdata_to_db(dataset, items, datatype):
    """Store *items* under a new named dataset; return 'exists' or 'added'."""
    # Refuse to create a second dataset with the same name.
    existing = db.session.query(
        Dataset, Dataset.name).filter(Dataset.name == dataset).all()
    if existing:
        return 'exists'
    record = Dataset(name=dataset, datatype=datatype)
    for payload in items:
        entry = Item(
            dataset_name=dataset,
            item=json.dumps(payload),
            status='available',
            timestamp=datetime.now(),
        )
        record.items.append(entry)
        db.session.add(entry)
    db.session.add(record)
    db.session.commit()
    return 'added'
def datasets():
    """List the current user's datasets and handle new file uploads."""
    if not current_user.is_authenticated:
        return redirect(url_for('no_access'))
    datasets = Dataset.query.filter(Dataset.user_id == current_user.id).all()
    for ds in datasets:
        # Fall back to the raw filename when no distinctive name was given;
        # blank the filename column when it would only repeat the name.
        ds.distinctive_name = ds.distinctive_name or ds.filename
        if ds.distinctive_name == ds.filename:
            ds.display_filename = ''
        else:
            ds.display_filename = ds.filename
    model = {
        'title': 'Datasets',
        'datasets': datasets
    }
    form = FileUploadForm()
    if form.validate_on_submit():
        dsFile = form.fileName.data
        separator = form.separator.data
        distinctive_name = form.distinctive_name.data
        filename = secure_filename(dsFile.filename)
        guid = str(uuid.uuid4())
        # Rewind before reading: the framework may already have consumed
        # part of the upload stream.
        dsFile.seek(0)
        dt = dsFile.read()
        dbDs = Dataset(filename, guid, g.user, datetime.datetime.utcnow(),
                       separator, distinctive_name, dt)
        db.session.add(dbDs)
        db.session.commit()
        # Post/Redirect/Get so a browser refresh does not re-upload.
        return redirect(url_for('datasets'))
    model['form'] = form
    return render_template('datasets.html', model = model)
def post(self):
    """Sync VertNet resource_staging rows from Carto into Dataset entities.

    Queries Carto for all IPT resources in the VertNet network, writes one
    Dataset entity per row, and responds with a JSON success summary.
    """
    urlfetch.set_default_fetch_deadline(60)
    self.response.headers['Content-Type'] = 'application/json'
    q = "select gbifdatasetid, icode, orgname, github_orgname, " \
        "source_url, github_reponame, url, gbifpublisherid " \
        "from resource_staging " \
        "where ipt=true and networks like '%VertNet%'"
    resources = carto_query(q)
    ds = []
    for resource in resources:
        # gbifdatasetid doubles as the datastore entity key id.
        ds.append(Dataset(id=resource['gbifdatasetid'], **resource))
    keys = ndb.put_multi(ds)
    result = {
        "datasets processed": len(keys),
        "message": "success"
    }
    self.response.write(json.dumps(result))
    return
def dataset_upload():
    """Ingest an uploaded dataset archive/list and register its images.

    Accepts .zip, .tar(.gz/.bz2), .txt (one URL per line) or .csv (URLs in
    cells). Archive members that are images become Blobs; `_name.csv`
    members define Keywords; .txt/.csv members contribute image URLs.
    On success the dataset is committed and queued for async processing.
    """
    form = DatasetForm()
    if form.validate_on_submit():
        upload = form.file.data
        name, ext = os.path.splitext(upload.filename)
        acceptable = ['.jpg', '.jpeg', '.png']
        label_acceptable = ['.csv']

        def unarchive_blob(item, dset, tmpd, archive):
            # Extract one archive member and register it as an image Blob.
            archive.extract(item, tmpd)
            # TODO: change to check if path contains valid image
            blob = Blob(os.path.join(str(tmpd), item.filename))
            dset.blobs.append(blob)
            return

        def list_blob(url):
            # Register a URL as a Blob when its extension looks like an
            # image. NOTE(review): closes over the enclosing `dset`.
            _, ext = os.path.splitext(url)
            if ext in acceptable:
                # TODO: change to check if url contains valid image
                blob = Blob(url)
                dset.blobs.append(blob)
            return

        def keyword_dataset(kw, item, dset, tmpd, archive):
            # A member named `_foo.csv` defines keyword 'foo' for the set.
            archive.extract(item, tmpd)
            kw_fname = str(tmpd) + "/" + item.filename
            k = Keyword(name=kw[1:], defn_file=kw_fname, dataset=dset)
            return

        dset = None
        if ext == ".zip":
            with zipfile.ZipFile(upload, 'r') as myzip:
                tmpd = tempfile.mkdtemp(dir=config.DATASET_DIR, prefix="dataset")
                dset = Dataset(name=name)
                db.session.add(dset)
                for item in myzip.infolist():
                    fname, ext = os.path.splitext(item.filename)
                    if "__MACOSX" in item.filename:
                        continue
                    kw = os.path.basename(fname)
                    if ext in acceptable:
                        unarchive_blob(item, dset, tmpd, myzip)
                    elif ext in label_acceptable and kw.startswith('_'):
                        print "creating keyword: " + kw
                        keyword_dataset(kw, item, dset, tmpd, myzip)
                    elif ext == ".txt":
                        # Member is a plain list of image URLs.
                        myzip.extract(item, tmpd)
                        with open(os.path.join(str(tmpd), item.filename)) as img_list:
                            for url in img_list:
                                url = url.rstrip()
                                list_blob(url)
                    elif ext == ".csv" and not kw.startswith('_'):
                        # Member is a CSV whose cells may contain URLs.
                        myzip.extract(item, tmpd)
                        with open(os.path.join(str(tmpd), item.filename)) as img_list:
                            for row in csv.reader(img_list):
                                for entry in row:
                                    url = as_url(entry)
                                    if url:
                                        list_blob(url)
        elif ext == ".gz" or ext == ".bz2" or ext == ".tar":
            # For .gz/.bz2, strip one more suffix to check for `.tar.gz`-style names.
            if ext != ".tar":
                name, ext = os.path.splitext(name)
            if ext == ".tar":
                with tarfile.open(fileobj=upload) as mytar:
                    tmpd = tempfile.mkdtemp(dir=config.DATASET_DIR, prefix="dataset")
                    dset = Dataset(name=name)
                    db.session.add(dset)
                    for item in mytar:
                        if item.isreg():
                            # NOTE(review): tarfile members expose `.name`,
                            # not `.filename` — confirm this branch works.
                            fname, ext = os.path.splitext(item.filename)
                            if "__MACOSX" in item.filename:
                                continue
                            kw = os.path.basename(fname)
                            if ext in acceptable:
                                unarchive_blob(item, dset, tmpd, mytar)
                            # NOTE(review): plain `if` here (the zip branch
                            # uses `elif`) — kept as found.
                            if ext in label_acceptable and kw.startswith('_'):
                                keyword_dataset(kw, item, dset, tmpd, mytar)
                            elif ext == ".txt":
                                mytar.extract(item, tmpd)
                                with open(
                                        os.path.join(
                                            str(tmpd), item.filename)) as img_list:
                                    for url in img_list:
                                        url = url.rstrip()
                                        list_blob(url)
                            elif ext == ".csv" and not kw.startswith('_'):
                                mytar.extract(item, tmpd)
                                with open(
                                        os.path.join(
                                            str(tmpd), item.filename)) as img_list:
                                    for row in csv.reader(img_list):
                                        for entry in row:
                                            url = as_url(entry)
                                            if url:
                                                list_blob(url)
        elif ext == ".txt":
            # The upload itself is a URL list.
            dset = Dataset(name=name)
            db.session.add(dset)
            for url in upload:
                url = url.rstrip()
                list_blob(url)
        elif ext == ".csv":
            # The upload itself is a CSV of URLs.
            dset = Dataset(name=name)
            db.session.add(dset)
            for row in csv.reader(upload):
                for entry in row:
                    url = as_url(entry)
                    if url:
                        list_blob(url)
        if dset != None:
            if form.patchspec.data:
                dset.patchspecs.append(form.patchspec.data)
            if form.featurespec.data:
                dset.featurespecs.append(form.featurespec.data)
            db.session.commit()
            # Kick off asynchronous processing of the new dataset.
            tasks.dataset.delay(dset.id)
            return jsonify(name=dset.name, id=dset.id, url=dset.url)
    else:
        print form.errors
        return jsonify(errors=form.file.errors)
def dataset_upload(request):
    """Django view: accept an uploaded table file and register it as a Dataset.

    GET renders the upload form. An authenticated POST saves the file under
    USR_DATASET_ROOT/<username>/, counts rows/columns, creates the Dataset
    row and redirects to its detail page.
    """
    user = request.user
    if request.method == 'POST':
        if user.is_authenticated():
            file = request.FILES.get('filename', '')
            file_name = file.name
            dest_dir = os.path.join(settings.USR_DATASET_ROOT, user.username)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            full_path = os.path.join(dest_dir, file_name)
            rel_path = os.path.join(user.username, file_name)
            # Stream the upload to disk chunk by chunk.
            destination = open(full_path, "wb+")
            for chunk in file.chunks():
                destination.write(chunk)
            destination.close()
            description = request.POST['description']
            access = request.POST['access']
            # Map the form's separator keyword to the actual character.
            tbl_separator = {
                "tab": '\t',
                "space": ' ',
                "comma": ',',
                "semicolon": ';'
            }
            sep_str = request.POST['sep']
            sep = tbl_separator[sep_str]
            header = request.POST['header']
            if header == 'yes':
                header = True
            elif header == 'no':
                header = False
            ## a simple check
            # Count rows and take the column count from the last line read.
            # NOTE(review): this file handle is never closed explicitly.
            size = 0
            for line in open(full_path):
                size += 1
                dim = len(line.split(sep))
            if header == True:
                size -= 1  # exclude the header line
            new_dataset = Dataset(owner=user, path=rel_path, name=file_name,
                                  dim=dim, size=size, description=description,
                                  access=access, sep=sep_str, header=header)
            new_dataset.save()
            notice = "Congratulations! Your dataset has been successfully uploaded."
            # return render_to_response('dataset/success.html', RequestContext(request, {'dataset': new_dataset, 'notice': notice}))
            return HttpResponseRedirect('/datasets/%s/' % new_dataset.id)
        else:
            notice = "You must be logged in to upload datasets"
            form = UploadDatasetForm()
            return render_to_response(
                'dataset/upload.html',
                RequestContext(request, {
                    'form': form,
                    'notice': notice
                }))
    else:
        form = UploadDatasetForm()
        return render_to_response('dataset/upload.html',
                                  RequestContext(request, {'form': form}))
"url": 'http://181.209.63.71/dataset/6897d435-8084-4685-b8ce-304b190755e4/resource/6145bf1c-a2fb-4bb5-b090-bb25f8419198/download/estructura-organica-3.csv', "name": 'Test At: n{}hrs.'.format(arrow.now().format('HH:mm'))} dataset = {"license_title": "Creative Commons Attribution", "maintainer": "Jose A. Salgado(M)", "private": False, "maintainer_email": "*****@*****.**", "id": "", "owner_org": "99920e14-6146-4cd1-8e57-d9d8c3b3190b", "author": "Jose A. Salgado", "author_email": "*****@*****.**", "state": "active", "license_id": "cc-by", "type": "dataset", "groups": [], "name": "", "isopen": True, "url": "", "notes": "Dataset de prueba para testear la colocacion de puntos sobre un mapa de la IGN", "title": "Rocket Science", "license_url": "http://www.opendefinition.org/licenses/cc-by"} d = Distribution(datadict=dist) my_dataset = Dataset(datadict=dataset, _distributions=d) if cu.save(my_dataset, only_metadata=True, _views=True): print "Dataset salvado con exito!!" else: print "Oops... algo se rompio..."
# Threshold sweep: for each tolerance multiplier `thr`, derive a binary
# is_defect label, fit a RandomForest via hyper-parameter search, and
# record the defect share and the best trial per threshold.
file_dataset = "df_σ02_350_08Х18Н10Т.json"
target_mech = "σ0,2_350"       # measured mechanical-property column
norm_mech = "σ0,2_350_norm"    # its normative (reference) column
target = "is_defect"           # derived binary label column
with open(file_dataset, 'r') as f:
    df = pd.DataFrame(json.loads(f.read()))
print("Dataset: read is done!")
output = defaultdict(list)
for thr in tqdm.tqdm([1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08,
                      1.09, 1.1], desc="Thr"):
    # Defect := measured value falls below thr * norm; the two raw
    # columns are dropped so only features remain.
    df_train = df.assign(
        is_defect=lambda row: (row[target_mech] - thr * row[norm_mech] < 0).astype(int)
    ).drop([target_mech, norm_mech], axis=1)
    share = df_train[target].mean()
    # Only numeric columns are serialized into the Dataset wrapper.
    d = Dataset(data=json.dumps(df_train.select_dtypes(np.number).to_dict('records')),
                features=df_train.select_dtypes(np.number).drop(target, axis=1).columns,
                target=target)
    m = MlModel(model_type='RandomForestClassifier')
    search_space = OptParams(model_type=type(m.get_model()).__name__)
    opt = Opt(data=d,
              params=search_space,
              pipeline=m,
              # Precision with zero_division=0 avoids warnings on empty
              # positive predictions.
              metric=partial(precision_score, zero_division=0),
              trials=Trials())
    opt.start_opt()
    output['thr'] += [thr]
    output['share'] += [share]
    output['best_trial'] += [opt.trials.best_trial['result']]
import textwrap
from models import Dataset

# Emit one LaTeX subsubsection per small dataset (<= 8 columns): its title,
# description, and an inline enumeration of the column-header meanings.
dataset = Dataset()
# Chinese labels below are runtime output strings, kept verbatim.
# NOTE(review): attribute_mapper and xyz_mapper are not referenced in this
# snippet — presumably used elsewhere, or dead.
attribute_mapper = {
    'balance': '平衡数据',
    'response_type': '响应变量的数据类型',
}
xyz_mapper = {
    't': '时间变量',
    'x': '协变量',
    'y': '响应变量',
    'z': '区分个体变量',
}
for meta, data in dataset:
    # Skip wide tables to keep the LaTeX output compact.
    if len(data.columns) > 8:
        continue
    print('\\subsubsection{{{}}}'.format(meta['title']))
    print(meta['description'] + '\n')
    print('\\begin{itemize}')
    print(' \\item 表头说明:')
    print(' \\begin{enumerate*}[label=(\\alph*), itemjoin={;}]')
    for x, y in meta['header'].items():
        # Escape underscores so LaTeX does not treat them as subscripts.
        x = x.replace('_', '\_')
        print(f' \\item {x},{y}')
    print(' \\end{enumerate*}。' + '\n')
def hazardous_waste(year=2011, verbose=True): try: dataset = Dataset.objects.get(name="Hazardous Waste Sites "+str(year)) dataset.cached = datetime.utcnow().replace(tzinfo=utc) except ObjectDoesNotExist: coor = GeoCoordinates(lat_field="Latitude", lon_field="Longitude") coor.save() names = DatasetNameField(field1_en="Generator Status", field1_name="Generator Status", field2_en="Biennial Report Link", field2_name="Biennial Report Link") names.save() location = Location(street_field="Address", city_field="City", state_field="State", zipcode_field="ZIP Code", county_field="County") dataset = Dataset( name="Hazardous Waste Sites "+str(year), url='/data/ej/'+str(year)+'/', cached=datetime.utcnow().replace(tzinfo=utc), cache_max_age=1000, remote_id_field="Handler ID", name_field="Handler Name", location=location, coordinates=coor, names=names needs_geocoding=False) dataset.save() MapPoint.objects.filter(dataset=dataset).delete() for state in ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']: short_name = 'Envirofacts_Biennial_Report_Search ' + state + '.CSV' path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name)) if not os.path.isfile(path): if verbose: print 'No file %s exists.' % (short_name) short_name = str(year)+' '+state+'.CSV' path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name)) if not os.path.isfile(path): if verbose: print 'No file %s exists.' 
% (short_name) continue if verbose: print 'Opening file %s' % (short_name) readfile = csv.reader(open(path, 'rb')) # verify row = readfile.next() locs = {} for i in range(len(row)): if row[i] == dataset.remote_id_field: locs['remote_id'] = i elif row[i] == dataset.name_field: locs['name'] = i elif row[i] == dataset.location.street_field: locs['street'] = i elif row[i] == dataset.location.city_field: locs['city'] = i elif row[i] == dataset.location.state_field: locs['state'] = i elif row[i] == dataset.location.zipcode_field: locs['zipcode'] = i elif row[i] == dataset.location.county_field: locs['county'] = i elif row[i] == dataset.coordinates.lat_field: locs['lat'] = i elif row[i] == dataset.coordinates.lon_field: locs['lon'] = i elif row[i] == dataset.names.field1_name: locs['field1'] = i elif row[i] == dataset.names.field2_name: locs['field2'] = i for row in readfile: kwargs = {'dataset': dataset} for key in locs: if key in ['lat', 'lon']: try: kwargs[key] = float(row[locs[key]]) except Exception: kwargs[key] = 0. elif MapPoint._meta.get_field(key).max_length < len(row[locs[key]]): kwargs[key] = row[locs[key]][:MapPoint._meta.get_field(key).max_length] else: kwargs[key] = row[locs[key]] try: kwargs['point'] = Point(kwargs['lon'], kwargs['lat']) except Exception: if verbose: print '\tInvalid lat/long for row: %s' % (row) print '\tLat: %f Lon: %f' % (kwargs['lat'], kwargs['lon']) continue mp = MapPoint(**kwargs) mp.save() if verbose: print 'File "%s" done processing' % (short_name)
def run(verbose=True, year=2010, starting_state=1): yn = '' # https://docs.djangoproject.com/en/1.7/ref/contrib/gis/layermapping/ while DEBUG and yn != 'y': yn = raw_input('This process can be memory-intensive if' 'DEBUG = True in settings as this logs all SQL. ' 'DEBUG is currently True. Please set this to False' 'if you are experiencing issues. Continue (y/n)?') \ .lower().strip() if yn == 'n': return dataset_qs = Dataset.objects.filter(name__exact=str(year)+' Census Tracts') if len(dataset_qs) > 0: ds = dataset_qs[0] ds.cached = datetime.utcnow().replace(tzinfo=utc), else: coor = GeoCoordinates(lat_field='INTPTLAT'+str(year)[-2:], lon_field='INTPTLON'+str(year)[-2:]) coor.save() names = DatasetNameField(field1_en='Land Area', field1_name='ALAND'+str(year)[-2:], field2_en='Water Area', field2_name='AWATER'+str(year)[-2:]) names.save() ds = Dataset(name=str(year)+' Census Tracts', cached=datetime.utcnow().replace(tzinfo=utc), cache_max_age=1000, name_field='NAMELSAD'+str(year)[-2:], coordinates=coor, names=names) if year == 2010: ds.remote_id_field = 'GEOID00' elif year == 2000: ds.remote_id_field = 'CTIDFP00' ds.save() tract_mapping = { 'remote_id': ds.remote_id_field, 'name': ds.name_field, 'lat': ds.coordinates.lat_field, 'lon': ds.coordinates.lon_field, 'field1': ds.names.field1_name, 'field2': ds.names.field2_name, 'mpoly': 'MULTIPOLYGON', } ftp = ftplib.FTP('ftp2.census.gov') ftp.login() ftp.cwd("/geo/tiger/TIGER2010/TRACT/" + str(year) + "/") files = ftp.nlst() MapPolygon.objects.filter(dataset_id__isnull=True).delete() max_state = MapPolygon.objects.filter(dataset_id__exact=ds.id).aggregate(Max('remote_id')) max_state = max_state['remote_id__max'] if max_state is not None: try: max_state = int(max_state)/1000000000 if max_state >= starting_state: starting_state = max_state + 1 except Exception: pass for i in [format(x, '#02d') for x in range(starting_state, 100)]: short_name = 'tl_2010_' + i + '_tract' + str(year)[-2:] tract_shp = 
os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/'+short_name)) if (not os.path.isfile(tract_shp+'.shp') or not os.path.isfile(tract_shp+'.shx') or not os.path.isfile(tract_shp+'.shp.xml') or not os.path.isfile(tract_shp+'.prj') or not os.path.isfile(tract_shp+'.dbf')): if short_name + '.zip' not in files: continue if verbose: print short_name + '.shp does not exist locally.\n\tDownloading from Census FTP...' try: # download the file local_file = open(tract_shp+'.zip', 'wb') ftp.retrbinary('RETR '+short_name+'.zip', local_file.write) local_file.close() # open the zip zipped = zipfile.ZipFile(tract_shp+'.zip') for suffix in ['.shp', '.prj', '.dbf', '.shp.xml', '.shx']: zipped.extract(short_name+suffix, os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))) except Exception as inst: if verbose: print '\tException:', inst print '\t'+short_name + '.shp did not download or unzip correctly. Moving on...' continue tract_shp = tract_shp + '.shp' if verbose: print '\tBegin layer mapping...' lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1') while True: try: lm.save(strict=True, verbose=False) # verbose) break # exception part is untested, error didn't happen again except Exception as inst: yn = '' while yn not in ['n', 'y']: yn = raw_input('Error saving: ' + str(inst) + '\nContinue (y/n)?').strip().lower() if yn == 'y': MapPolygon.objects.filter(dataset_id__isnull=True).filter(remote_id__startswith=i).delete() else: break if verbose: print '\tLayer mapping done.' MapPolygon.objects.filter(dataset=None).update(dataset=ds) if verbose: print '\tLayer associated with dataset.' ftp.quit() if verbose: print 'All shapefiles added.'
def upload():
    """Accept a csv/tsv/json upload, clean it, persist it, and redirect.

    The uploaded table is cleaned via data_cleaning (missing-value fix +
    clean), written back over the upload path, pickled into the Dataset
    table (replacing any same-named entry), then the user is redirected
    to the interactive view.
    """
    #try:
    file = request.files['file']
    filename = file.filename
    if filename == '':
        raise ValueError('No file uploaded!!')
    file_uploads_path = os.path.join(config.UPLOADS_DIR, filename)
    file_static_path = os.path.join(config.STATIC_DIR, 'output')
    file_static_path = os.path.join(file_static_path, filename)
    file.save(file_uploads_path)
    cleaned_file = ''
    # Dispatch on the file extension; each branch cleans the frame and
    # rewrites the uploaded file with the cleaned content.
    if (filename.rsplit('.', 1)[1].lower() == 'csv'):
        dirty_file = pd.read_csv(file_uploads_path, sep=',')
        # CSV inputs must carry valid id / classLabel columns.
        res = data_cleaning.id_classLabel_check(dirty_file)
        if (res != True):
            raise ValueError(res)
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_csv(file_uploads_path, sep=',', index=False)
    elif (filename.rsplit('.', 1)[1].lower() == 'tsv'):
        dirty_file = pd.read_csv(file_uploads_path, sep='\t')
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_csv(file_uploads_path, sep=',', index=False)
    elif (filename.rsplit('.', 1)[1].lower() == 'json'):
        print(str(file_uploads_path))
        dirty_file = pd.read_json(str(file_uploads_path))
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_json(file_uploads_path)
    else:
        raise ValueError(
            'Invalid file input! Please check the input file type')
    #cleaned_file =cleaned_file.sort_values(by=['classLabel'])
    #cleaned_file.to_csv('static/test2.csv')
    download_path = 'static/uploads/' + filename
    session['filename'] = filename
    #X = Dataset.query.filter_by(name = filename)
    #for i in X:
    #    print(cPickle.loads(i.content))
    #SAVING INPUT DATASET TO DATABASE
    try:
        #print(cleaned_file)
        serialized_content = cPickle.dumps(cleaned_file)
        #session['cleaned_file'] = serialized_content
        # Replace any previously stored dataset carrying the same name.
        existingDataset = Dataset.query.filter_by(name=filename).all()
        for data in existingDataset:
            db.session.delete(data)
        dataset = Dataset(filename, download_path, serialized_content)
        db.session.add(dataset)
        db.session.commit()
    except Exception as e:
        # NOTE(review): persistence failures are only printed, never
        # surfaced to the user — confirm this is intentional.
        print(e)
        #raise ValueError('Dataset with this name already exist in database!. Please update dataset name')
    # Strip the label column and re-index the frame by its id column.
    del cleaned_file['classLabel']
    cleaned_file.index = cleaned_file['id']
    del cleaned_file['id']
    #paramObj = heidi_api.getAllSubspaces(cleaned_file, filename)
    #return render_template('success.html', download_path=download_path, user=current_user)
    #return render_template('dimension_new.html', title = 'visual tool', user = current_user, paramObj = paramObj)
    return redirect(url_for('heidi_controllers.interactive_heidi'))