Example #1
def insert_dataset(nex_session, fw, x, parent_dataset_id):

    print("DATASET:", x)

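    # Build the Dataset row; optional columns default to None via .get()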
    y = Dataset(format_name=x['format_name'],
                display_name=x['display_name'],
                obj_url="/dataset/" + x['format_name'],
                source_id=x['source_id'],
                dbxref_id=x.get('dbxref_id'),
                dbxref_type=x.get('dbxref_type'),
                date_public=x.get('date_public'),
                parent_dataset_id=x.get('parent_dataset_id'),
                assay_id=x.get('assay_id'),
                channel_count=x.get('channel_count'),
                sample_count=x.get('sample_count'),
                is_in_spell=x.get('is_in_spell'),
                is_in_browser=x.get('is_in_browser'),
                description=x.get('description'),
                created_by=CREATED_BY)

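    # Flush and refresh so the auto-generated dataset_id is available to return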
    nex_session.add(y)
    nex_session.flush()
    nex_session.refresh(y)

    fw.write("Insert dataset: " + x['display_name'] + " into database\n")

    return y.dataset_id
Example #2
def create_featureset(sessionconfig, params):

    session = sessionconfig[0]
    config = sessionconfig[1]

    modulename = params['modulename']
    analysisMod = session.query(Analysis).from_statement(text("SELECT * FROM analysis where name=:name")).\
        params(name=modulename).first()

    if(analysisMod):  # Check if the module exists

        # module_id = analysisMod.id
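        # Make sure a feature set with this name does not already exist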
        checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
            params(name=params['name']).first()

        if(checkDataset is None):
            dataset = Dataset(name=params['name'], identifier='', description=params['description'], details=params['details'], module_parameters=params['module_parameters'], created=params['created'], user=params['user'], fileformat="Parquet", filepath=params['filepath'], schema=params['schema'], module_id=analysisMod.id)
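            # Back up the metadata store before committing the new feature set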
            shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])

            session.add(dataset)
            session.commit()

        else:
            raise RuntimeError('The feature set with the name ' + params['name'] + ' already exists')
    else:
        raise RuntimeError('No Such Module')
Example #3
def create_dataset(sessionconfig, params):

    session = sessionconfig[0]
    config = sessionconfig[1]

    checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
        params(name=params['name']).first()

    if(checkDataset is None):

        dataset = Dataset(name=params['name'], identifier=params['identifier'], description=params['description'], details=params['details'], module_parameters='', created=params['created'], user=params['user'], fileformat="Parquet", filepath=params['filepath'], schema=params['schema'], module_id='')
        shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])

        session.add(dataset)
        session.commit()

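        # Queue the metadata file for upload to the configured storage backend (hdfs/swift/nfs)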
        objs = []
        if(config['BACKEND'] == 'hdfs'):
            objs.append((config['MODULES_DIR'] + 'sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'swift'):
            objs.append(('sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'nfs'):
            pass

        saveObjsBackend(objs, config['BACKEND'], config)

    else:
        raise RuntimeError("The dataset with name " + params['name'] + " already exists")
Example #4
def datasetr():
    print(request.method)

    form = Dataset(request.form)
    nameMale = request.form["manname"]
    femalName = request.form["womenname"]
    language = request.form["language"]
    time = request.form["time"]
    min_tweet = request.form["min_number"]
    (tweets, y) = getTweets2(int(time), nameMale, femalName, language,
                             int(min_tweet))
    # tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
    #form.tweet. = 'ada'
    #y = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
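    # y holds predicted labels: 0 = female ("Femme"), 1 = male ("Homme")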
    y2 = np.array(y)
    f = np.count_nonzero(y2 == 0)
    h = np.count_nonzero(y2 == 1)
    detail = [
        'Nombre de tweets: ' + str(len(y)), 'Femme: ' + str(f),
        'Homme: ' + str(h)
    ]
    #flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html',
                               form=form,
                               tweets=tweets,
                               detail=detail)
Example #5
def save_form_data(form, file_name):
    """Save the data associated with an uploaded dataset."""
    data_dict = dict((field, form[field]) for field in form)
    # Now delete the unnecessary keys...
    del data_dict['submit'], data_dict['csrf']
    data_dict['file_name'] = file_name
    dataset = Dataset(**data_dict)
    db.session.add(dataset)
    db.session.commit()
Example #6
    def create_dataset(self, user, name, parent=None):
        if parent:
            parent = self.session.query(Dataset).filter(
                Dataset.name == parent).first()

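        # Derive the dataset's storage directory from a hash of owner name + dataset name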
        path = hashlib.sha256((user.name + name).encode('utf-8')).hexdigest()
        dataset = Dataset(name=name, owner=user, path=path, parent=[parent])
        abspath = os.path.join(self.path, path)
        os.makedirs(abspath)
Example #7
def example():
    dataset = Dataset()

    meta, data = dataset.at(0, xy=False)
    symbol = Symbol(meta, data)
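    # Model formula: log(count) ~ age*smoke interaction + age + smoke + drug + partners + cesd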
    symbols = (
        symbol['count'].apply(np.log)
        ==
        symbol[['age', 'smoke']].interact(lambda x: x[0]*x[1], name='age_smoke') +
        symbol['age'] + symbol['smoke'] + symbol['drug'] + symbol['partners'] + symbol['cesd']
    )
    return symbols
Example #8
def savecsv():
    print(request.method)

    form = Dataset(request.form)

    if 'tweets' in request.form:
        tweets = request.form['tweets']
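        # The form posts the tweet list as a Python literal; eval() turns it back into (text, gender) pairs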
        tweetsR = eval(tweets)
        #return str(tweetsR[0])
        #name = asksaveasfilename()

        if request.form['typef'] == 'json':
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du Json",
                filetypes=(("JSON", "*.json"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            with open(root.filename + '.json', 'w') as out_f:
                json.dump(tweetsR, out_f)
        else:
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du dataset",
                filetypes=(("xlsx", "*.xlsx"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            workbook = xlsxwriter.Workbook(root.filename + '.xlsx')
            worksheet = workbook.add_worksheet()

            rowEx = 1
            worksheet.write(0, 0, 'text')
            worksheet.write(0, 1, 'gender')
            for tweet in tweetsR:
                worksheet.write(rowEx, 0, tweet[0])
                worksheet.write(rowEx, 1, tweet[1])
                rowEx += 1

            workbook.close()
        #with open(name + '.csv', 'w', newline='') as csvfile:
        #   create = csv.writer(csvfile)
        #  create.writerow(tweets)

# tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
#form.tweet. = 'ada'
#y = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]

#flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form, tweets=tweetsR)
Example #9
def init_db():
    # import all modules here that might define models so that
    # they will be registered properly on the metadata.  Otherwise
    # you will have to import them first before calling init_db()
    from models import Department, User, Role, Dataset
    Base.metadata.drop_all(bind=engine)
    Base.metadata.create_all(bind=engine)

    # Create the fixtures
    engineering = Department(name='Engineering')
    db_session.add(engineering)
    hr = Department(name='Human Resources')
    db_session.add(hr)

    manager = Role(name='manager')
    db_session.add(manager)
    engineer = Role(name='engineer')
    db_session.add(engineer)

    peter = User(name='Peter', department=engineering, role=engineer)
    db_session.add(peter)
    roy = User(name='Roy', department=engineering, role=engineer)
    db_session.add(roy)
    tracy = User(name='Tracy', department=hr, role=manager)
    db_session.add(tracy)

    # Dataset
    import random
    from random import randint
    from faker import Faker
    fake = Faker('en_US')
    nPoints = 11

    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    data = {
        'x': [int(i) for i in range(nPoints)],
        'z': [float(i) for i in range(nPoints)],
        'names': [fake.name() for i in range(nPoints)]
    }
    test_data1 = Dataset(name='dataset1',
                         description='First dataset',
                         table_name='data1',
                         enabled=True,
                         raw=data)
    db_session.add(test_data1)

    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    # test_data2 = Dataset(name='dataset2', description='Second dataset', table_name='data2', enabled=False, raw=data)
    # db_session.add(test_data2)

    db_session.commit()
Example #10
def get_dataset(datastore, id):
    '''
    Creates a dataset object from the .valid file
    '''
    try:
        valid_path = '{0}/uploads/.valid'.format(id)
        valid_file = datastore.read(valid_path)
    except AttributeError:
        return None

    if valid_file.read() == id:
        dataset = Dataset(id)
        dataset.datastore = datastore
        return dataset
Example #11
def dataset():
    print(request.method)
    form = Dataset(request.form)
    #root = Tk()
    #root.filename =  filedialog.asksaveasfilename(initialdir = "/",title = "Select file",filetypes = (("jpeg files","*.jpg"),("all files","*.*")))
    #name = asksaveasfilename()
    #with open(name + '.csv', 'w', newline='') as csvfile:
    #   create = csv.writer(csvfile)
    #  create.writerow(["adam","deboosere"])

    if not form.validate_on_submit():
        return render_template('dataset.html', form=form)
    if request.method == 'POST':
        #return 'Submitted!'
        return render_template('dataset.html', form=form)
Example #12
def new_dataset():
    '''
    Create a unique url for this dataset to work under
    Create a folder on S3 using this url
    '''
    # Make a new dataset object
    id = str(uuid.uuid4())
    dataset = Dataset(id)
    dataset.datastore = make_datastore(app.config['DATASTORE'])

    # Write a verifying file to prove we created these folders
    validname = '{0}/uploads/.valid'.format(dataset.id)
    dataset.datastore.write(validname, StringIO(dataset.id))

    return redirect('/datasets/' + dataset.id)
Example #13
def initvalidateddataset():
    lines = list(open(data_validated_csv).readlines())

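    # Each line has the form "audio_path,length,text"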
    for line in lines:
        audio_path, lenght, text = line.split(',')
        text = text.replace('\n', '')
        new_data = Dataset()
        new_data.text = text
        new_data.audio_lenght = lenght
        new_data.file_path = audio_path
        new_data.file_with_user = 0 # 1 if user validating this instance
        new_data.instance_validated = 1 #1 if human validated this instance
        new_data.instance_valid = 1 # 1 if instance is ok
        new_data.user_validated = 'edresson'
        db.session.add(new_data)
    db.session.commit()
Example #14
    def add_dataset():
        body = request.get_json()

        name = body.get('name')
        type = body.get('type')
        description = body.get('description')
        provider_id = body.get('provider_id')

        try:
            dataset = Dataset(name, provider_id, type, description)
            dataset.insert()

            return jsonify({'success': True, 'dataset_id': dataset.id})
        except Exception as es:
            print(es)
            abort(422)
Example #15
def example_1_2_6():
    # Data preprocessing
    meta, data = Dataset().at(4, xy=False)
    mapper = collections.Counter(data['id'])
    for id in mapper:
        index = data['id'] == id
        data.loc[index, 'weight'] = data.loc[index, 'weight'].mean()
    # Symbolic representation of the linear model
    symbol = Symbol(meta, data)
    symbols = (
        symbol['weight']
        ==
        symbol['gender'] + symbol['dose'] +
        symbol['id'].apply(lambda xs: [mapper[x] for x in xs], 'size')
    )
    return symbols
Example #16
    def diff_datasets(dataset_a=None, dataset_b=None):
        """
        Compares two datasets and returns their additive difference.

        When the difference is computed, the value that prevails is the
        one from dataset_b.

        Args:
        ====
            - dataset_a:
                - Dataset().
                - Must be of type Dataset().

            - dataset_b:
                - Dataset().
                - Must be of type Dataset().

        Returns:
        =======
            - Dataset().

        Exceptions:
        ==========
            TypeError:
                - One or both arguments are not of class Dataset.
        """
        from models import Dataset
        for v in [dataset_a, dataset_b]:
            if not isinstance(v, Dataset):
                raise TypeError(
                    'Para comparar los datasets ambos deben ser de clase Dataset.'
                )
        diff_ds = {}
        omit_this_keys = ['required_keys', 'context']
        for k, v in dataset_a.__dict__.items():
            if k not in omit_this_keys:
                if v != dataset_b.__dict__[k]:
                    diff_ds.update({
                        k:
                        dataset_b.__dict__[k]
                        if len(dataset_b.__dict__[k]) > 0 else v
                    })
                else:
                    diff_ds.update({k: v})
        return Dataset(datadict=diff_ds,
                       _distributions=dataset_a.__dict__['resources'],
                       _distribution_literal=True)
Example #17
    def freeze_dataset(self, id_or_name):
        """
        Creates a temporary snapshot of a dataset's contents.

        Args:
        ====
            - id_or_name:
                - str().
                - Id or name of the dataset to freeze.
        Returns:
        =======
            - Dataset: if the object can be located and is "freezable".

        Exceptions:
        ==========
            - ValueError:
                - id_or_name is a str or unicode but has len == 0.
            - TypeError:
                - id_or_name is not a str or unicode.
        """
        from models import Dataset
        stored_dataset = self.retrieve_dataset_metadata(id_or_name)
        if stored_dataset:
            freezed_dataset = {
                "license_title": stored_dataset['license_title'],
                "maintainer": stored_dataset['maintainer'],
                "private": stored_dataset['private'],
                "maintainer_email": stored_dataset['maintainer_email'],
                "id": stored_dataset['id'],
                "owner_org": stored_dataset['owner_org'],
                "author": stored_dataset['author'],
                "isopen": stored_dataset['isopen'],
                "author_email": stored_dataset['author_email'],
                "state": stored_dataset['state'],
                "license_id": stored_dataset['license_id'],
                "type": stored_dataset['type'],
                "groups": [g['name'] for g in stored_dataset['groups']],
                "creator_user_id": stored_dataset['creator_user_id'],
                "name": stored_dataset['name'],
                "url": stored_dataset['url'],
                "notes": stored_dataset['notes'],
                "title": stored_dataset['title'],
                "license_url": stored_dataset['license_url']
            }
            return Dataset(datadict=freezed_dataset,
                           _distribution_literal=True,
                           _distributions=stored_dataset['resources'])
Example #18
def add_testdata_to_db(dataset, items, datatype):
    count = db.session.query(
        Dataset, Dataset.name).filter(Dataset.name == dataset).all()
    if len(count) > 0:
        return 'exists'

    new_dataset = Dataset(name=dataset, datatype=datatype)
    for item in items:
        testitem = Item(
            dataset_name=dataset,
            item=json.dumps(item),
            status='available',
            timestamp=datetime.now(),
        )
        new_dataset.items.append(testitem)
        db.session.add(testitem)
    db.session.add(new_dataset)
    db.session.commit()
    return 'added'
Example #19
def datasets():
	if not current_user.is_authenticated:
		return redirect(url_for('no_access'))

	datasets = Dataset.query.filter(Dataset.user_id == current_user.id).all()

	for ds in datasets:
		ds.distinctive_name = ds.distinctive_name or ds.filename
		if ds.distinctive_name == ds.filename:
			ds.display_filename = ''
		else: 
			ds.display_filename = ds.filename
			
	model = {
		'title': 'Datasets',
		'datasets': datasets
	}
	form = FileUploadForm()
	if form.validate_on_submit():

		dsFile = form.fileName.data

		separator = form.separator.data
		distinctive_name = form.distinctive_name.data

		filename = secure_filename(dsFile.filename)
		guid = str(uuid.uuid4())
		
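		# Rewind the upload stream and read its raw contents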
		dsFile.seek(0)
		dt = dsFile.read()
		
		dbDs = Dataset(filename, guid, g.user, datetime.datetime.utcnow(), separator, distinctive_name, dt)
		
		db.session.add(dbDs)
		db.session.commit()
		return redirect(url_for('datasets'))
	
	model['form'] = form

	return render_template('datasets.html', model = model)
Example #20
    def post(self):
        urlfetch.set_default_fetch_deadline(60)
        self.response.headers['Content-Type'] = 'application/json'

        q = "select gbifdatasetid, icode, orgname, github_orgname, " \
            "source_url, github_reponame, url, gbifpublisherid " \
            "from resource_staging " \
            "where ipt=true and networks like '%VertNet%'"
        resources = carto_query(q)

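        # Build a Dataset entity for each resource row returned by the Carto query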
        ds = []
        for resource in resources:
            ds.append(Dataset(id=resource['gbifdatasetid'], **resource))

        keys = ndb.put_multi(ds)

        result = {
            "datasets processed": len(keys),
            "message": "success"
        }

        self.response.write(json.dumps(result))
        return
Example #21
def dataset_upload():
    form = DatasetForm()
    if form.validate_on_submit():
        upload = form.file.data
        name, ext = os.path.splitext(upload.filename)

        acceptable = ['.jpg', '.jpeg', '.png']
        label_acceptable = ['.csv']

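        # Helper callbacks used while walking an uploaded archive or URL list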
        def unarchive_blob(item, dset, tmpd, archive):
            archive.extract(item, tmpd)
            # TODO: change to check if path contains valid image
            blob = Blob(os.path.join(str(tmpd), item.filename))
            dset.blobs.append(blob)
            return

        def list_blob(url):
            _, ext = os.path.splitext(url)
            if ext in acceptable:
                # TODO: change to check if url contains valid image
                blob = Blob(url)
                dset.blobs.append(blob)
            return

        def keyword_dataset(kw, item, dset, tmpd, archive):
            archive.extract(item, tmpd)
            kw_fname = str(tmpd) + "/" + item.filename
            k = Keyword(name=kw[1:], defn_file=kw_fname, dataset=dset)
            return

        dset = None
        if ext == ".zip":
            with zipfile.ZipFile(upload, 'r') as myzip:
                tmpd = tempfile.mkdtemp(dir=config.DATASET_DIR,
                                        prefix="dataset")
                dset = Dataset(name=name)
                db.session.add(dset)

                for item in myzip.infolist():
                    fname, ext = os.path.splitext(item.filename)
                    if "__MACOSX" in item.filename:
                        continue
                    kw = os.path.basename(fname)
                    if ext in acceptable:
                        unarchive_blob(item, dset, tmpd, myzip)
                    elif ext in label_acceptable and kw.startswith('_'):
                        print("creating keyword: " + kw)
                        keyword_dataset(kw, item, dset, tmpd, myzip)
                    elif ext == ".txt":
                        myzip.extract(item, tmpd)
                        with open(os.path.join(str(tmpd),
                                               item.filename)) as img_list:
                            for url in img_list:
                                url = url.rstrip()
                                list_blob(url)
                    elif ext == ".csv" and not kw.startswith('_'):
                        myzip.extract(item, tmpd)
                        with open(os.path.join(str(tmpd),
                                               item.filename)) as img_list:
                            for row in csv.reader(img_list):
                                for entry in row:
                                    url = as_url(entry)
                                    if url:
                                        list_blob(url)

        elif ext == ".gz" or ext == ".bz2" or ext == ".tar":
            if ext != ".tar":
                name, ext = os.path.splitext(name)

            if ext == ".tar":
                with tarfile.open(fileobj=upload) as mytar:
                    tmpd = tempfile.mkdtemp(dir=config.DATASET_DIR,
                                            prefix="dataset")
                    dset = Dataset(name=name)
                    db.session.add(dset)

                    for item in mytar:
                        if item.isreg():
                            fname, ext = os.path.splitext(item.filename)
                            if "__MACOSX" in item.filename:
                                continue
                            kw = os.path.basename(fname)
                            if ext in acceptable:
                                unarchive_blob(item, dset, tmpd, mytar)
                            if ext in label_acceptable and kw.startswith('_'):
                                keyword_dataset(kw, item, dset, tmpd, mytar)
                            elif ext == ".txt":
                                mytar.extract(item, tmpd)
                                with open(
                                        os.path.join(
                                            str(tmpd),
                                            item.filename)) as img_list:
                                    for url in img_list:
                                        url = url.rstrip()
                                        list_blob(url)
                            elif ext == ".csv" and not kw.startswith('_'):
                                mytar.extract(item, tmpd)
                                with open(
                                        os.path.join(
                                            str(tmpd),
                                            item.filename)) as img_list:
                                    for row in csv.reader(img_list):
                                        for entry in row:
                                            url = as_url(entry)
                                            if url:
                                                list_blob(url)
        elif ext == ".txt":
            dset = Dataset(name=name)
            db.session.add(dset)
            for url in upload:
                url = url.rstrip()
                list_blob(url)
        elif ext == ".csv":
            dset = Dataset(name=name)
            db.session.add(dset)
            for row in csv.reader(upload):
                for entry in row:
                    url = as_url(entry)
                    if url:
                        list_blob(url)

        if dset is not None:
            if form.patchspec.data:
                dset.patchspecs.append(form.patchspec.data)
            if form.featurespec.data:
                dset.featurespecs.append(form.featurespec.data)
            db.session.commit()
            tasks.dataset.delay(dset.id)
            return jsonify(name=dset.name, id=dset.id, url=dset.url)
    else:
        print(form.errors)
        return jsonify(errors=form.file.errors)
Example #22
File: views.py  Project: hdubey/bigdata
def dataset_upload(request):

    user = request.user

    if request.method == 'POST':
        if user.is_authenticated():

            file = request.FILES.get('filename', '')

            file_name = file.name
            dest_dir = os.path.join(settings.USR_DATASET_ROOT, user.username)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)

            full_path = os.path.join(dest_dir, file_name)
            rel_path = os.path.join(user.username, file_name)
            destination = open(full_path, "wb+")
            for chunk in file.chunks():
                destination.write(chunk)
            destination.close()

            description = request.POST['description']
            access = request.POST['access']
            tbl_separator = {
                "tab": '\t',
                "space": ' ',
                "comma": ',',
                "semicolon": ';'
            }
            sep_str = request.POST['sep']
            sep = tbl_separator[sep_str]
            header = request.POST['header']
            if header == 'yes':
                header = True
            elif header == 'no':
                header = False

            ## a simple check
            size = 0
            for line in open(full_path):
                size += 1
            dim = len(line.split(sep))
            if header == True:
                size -= 1  # exclude the header line

            new_dataset = Dataset(owner=user,
                                  path=rel_path,
                                  name=file_name,
                                  dim=dim,
                                  size=size,
                                  description=description,
                                  access=access,
                                  sep=sep_str,
                                  header=header)
            new_dataset.save()

            notice = "Congratulations! Your dataset has been successfully uploaded."
            # return render_to_response('dataset/success.html', RequestContext(request, {'dataset': new_dataset, 'notice': notice}))
            return HttpResponseRedirect('/datasets/%s/' % new_dataset.id)

        else:
            notice = "You must be logged in to upload datasets"
            form = UploadDatasetForm()
            return render_to_response(
                'dataset/upload.html',
                RequestContext(request, {
                    'form': form,
                    'notice': notice
                }))

    else:
        form = UploadDatasetForm()
        return render_to_response('dataset/upload.html',
                                  RequestContext(request, {'form': form}))
Example #23
    "url": 'http://181.209.63.71/dataset/6897d435-8084-4685-b8ce-304b190755e4/resource/6145bf1c-a2fb-4bb5-b090-bb25f8419198/download/estructura-organica-3.csv',
    "name": 'Test At: n{}hrs.'.format(arrow.now().format('HH:mm'))}

dataset = {"license_title": "Creative Commons Attribution",
           "maintainer": "Jose A. Salgado(M)",
           "private": False,
           "maintainer_email": "*****@*****.**",
           "id": "",
           "owner_org": "99920e14-6146-4cd1-8e57-d9d8c3b3190b",
           "author": "Jose A. Salgado",
           "author_email": "*****@*****.**",
           "state": "active",
           "license_id": "cc-by",
           "type": "dataset",
           "groups": [],
           "name": "",
           "isopen": True,
           "url": "",
           "notes": "Dataset de prueba para testear la colocacion de puntos sobre un mapa de la IGN",
           "title": "Rocket Science",
           "license_url": "http://www.opendefinition.org/licenses/cc-by"}
d = Distribution(datadict=dist)
my_dataset = Dataset(datadict=dataset, _distributions=d)
if cu.save(my_dataset,
           only_metadata=True,
           _views=True):
    print("Dataset salvado con exito!!")
else:
    print("Oops... algo se rompio...")

Example #24
    file_dataset = "df_σ02_350_08Х18Н10Т.json"
    target_mech = "σ0,2_350"
    norm_mech = "σ0,2_350_norm"
    target = "is_defect"

    with open(file_dataset, 'r') as f:
        df = pd.DataFrame(json.loads(f.read()))

    print("Dataset: read is done!")

    output = defaultdict(list)

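    # For each defect threshold, derive binary labels, tune a RandomForestClassifier, and record the best trial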
    for thr in tqdm.tqdm([1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1], desc="Thr"):
        df_train = df.assign(is_defect=lambda row: (row[target_mech] - thr * row[norm_mech] < 0).astype(int)).drop([target_mech, norm_mech], axis=1)
        share = df_train[target].mean()
        d = Dataset(data=json.dumps(df_train.select_dtypes(np.number).to_dict('records')),
                    features=df_train.select_dtypes(np.number).drop(target, axis=1).columns, target=target)
        m = MlModel(model_type='RandomForestClassifier')
        search_space = OptParams(model_type=type(m.get_model()).__name__)
        opt = Opt(data=d,
                  params=search_space,
                  pipeline=m,
                  metric=partial(precision_score, zero_division=0),
                  trials=Trials()
                  )

        opt.start_opt()

        output['thr'] += [thr]
        output['share'] += [share]
        output['best_trial'] += [opt.trials.best_trial['result']]
Example #25
import textwrap

from models import Dataset

dataset = Dataset()
attribute_mapper = {
    'balance': '平衡数据',
    'response_type': '响应变量的数据类型',
}
xyz_mapper = {
    't': '时间变量',
    'x': '协变量',
    'y': '响应变量',
    'z': '区分个体变量',
}

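# Emit a LaTeX subsubsection with a header legend for each dataset that has at most 8 columns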
for meta, data in dataset:
    if len(data.columns) > 8:
        continue
    print('\\subsubsection{{{}}}'.format(meta['title']))
    print(meta['description'] + '\n')

    print('\\begin{itemize}')

    print('    \\item 表头说明:')
    print('        \\begin{enumerate*}[label=(\\alph*), itemjoin={;}]')
    for x, y in meta['header'].items():
        x = x.replace('_', '\\_')
        print(f'            \\item {x},{y}')
    print('        \\end{enumerate*}。' + '\n')
Example #26
def hazardous_waste(year=2011, verbose=True):
    try:
        dataset = Dataset.objects.get(name="Hazardous Waste Sites "+str(year))
        dataset.cached = datetime.utcnow().replace(tzinfo=utc)
    except ObjectDoesNotExist:
        coor = GeoCoordinates(lat_field="Latitude",
                              lon_field="Longitude")
        coor.save()
        names = DatasetNameField(field1_en="Generator Status",
                                 field1_name="Generator Status",
                                 field2_en="Biennial Report Link",
                                 field2_name="Biennial Report Link")
        names.save()
        location = Location(street_field="Address",
                            city_field="City",
                            state_field="State",
                            zipcode_field="ZIP Code",
                            county_field="County")
        dataset = Dataset(
            name="Hazardous Waste Sites "+str(year),
            url='/data/ej/'+str(year)+'/',
            cached=datetime.utcnow().replace(tzinfo=utc),
            cache_max_age=1000,
            remote_id_field="Handler ID",
            name_field="Handler Name",
            location=location,
            coordinates=coor,
            names=names,
            needs_geocoding=False)
    dataset.save()

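    # Clear any previously imported points for this dataset before re-importing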
    MapPoint.objects.filter(dataset=dataset).delete()

    for state in ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE',
                  'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA',
                  'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN',
                  'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM',
                  'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
                  'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA',
                  'WV', 'WI', 'WY']:
        short_name = 'Envirofacts_Biennial_Report_Search ' + state + '.CSV'
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name))
        if not os.path.isfile(path):
            if verbose:
                print('No file %s exists.' % (short_name))
            short_name = str(year)+' '+state+'.CSV'
            path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name))
            if not os.path.isfile(path):
                if verbose:
                    print('No file %s exists.' % (short_name))
                continue
        if verbose:
            print('Opening file %s' % (short_name))
        readfile = csv.reader(open(path, 'r'))
        # verify
        row = next(readfile)
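        # Map each required dataset field to its column index in the CSV header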
        locs = {}
        for i in range(len(row)):
            if row[i] == dataset.remote_id_field:
                locs['remote_id'] = i
            elif row[i] == dataset.name_field:
                locs['name'] = i
            elif row[i] == dataset.location.street_field:
                locs['street'] = i
            elif row[i] == dataset.location.city_field:
                locs['city'] = i
            elif row[i] == dataset.location.state_field:
                locs['state'] = i
            elif row[i] == dataset.location.zipcode_field:
                locs['zipcode'] = i
            elif row[i] == dataset.location.county_field:
                locs['county'] = i
            elif row[i] == dataset.coordinates.lat_field:
                locs['lat'] = i
            elif row[i] == dataset.coordinates.lon_field:
                locs['lon'] = i
            elif row[i] == dataset.names.field1_name:
                locs['field1'] = i
            elif row[i] == dataset.names.field2_name:
                locs['field2'] = i
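        # Import each data row as a MapPoint, truncating over-long fields and defaulting bad coordinates to 0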
        for row in readfile:
            kwargs = {'dataset': dataset}
            for key in locs:
                if key in ['lat', 'lon']:
                    try:
                        kwargs[key] = float(row[locs[key]])
                    except Exception:
                        kwargs[key] = 0.
                elif MapPoint._meta.get_field(key).max_length < len(row[locs[key]]):
                    kwargs[key] = row[locs[key]][:MapPoint._meta.get_field(key).max_length]
                else:
                    kwargs[key] = row[locs[key]]
            try:
                kwargs['point'] = Point(kwargs['lon'], kwargs['lat'])
            except Exception:
                if verbose:
                    print('\tInvalid lat/long for row: %s' % (row))
                    print('\tLat: %f Lon: %f' % (kwargs['lat'], kwargs['lon']))
                continue
            mp = MapPoint(**kwargs)
            mp.save()
        if verbose:
            print('File "%s" done processing' % (short_name))
Example #27
def run(verbose=True, year=2010, starting_state=1):
    yn = ''
    # https://docs.djangoproject.com/en/1.7/ref/contrib/gis/layermapping/
    while DEBUG and yn != 'y':
        yn = input('This process can be memory-intensive if '
                   'DEBUG = True in settings as this logs all SQL. '
                   'DEBUG is currently True. Please set this to False '
                   'if you are experiencing issues. Continue (y/n)?') \
                   .lower().strip()
        if yn == 'n':
            return
    dataset_qs = Dataset.objects.filter(name__exact=str(year)+' Census Tracts')
    if len(dataset_qs) > 0:
        ds = dataset_qs[0]
        ds.cached = datetime.utcnow().replace(tzinfo=utc)
    else:
        coor = GeoCoordinates(lat_field='INTPTLAT'+str(year)[-2:],
                              lon_field='INTPTLON'+str(year)[-2:])
        coor.save()
        names = DatasetNameField(field1_en='Land Area',
                                 field1_name='ALAND'+str(year)[-2:],
                                 field2_en='Water Area',
                                 field2_name='AWATER'+str(year)[-2:])
        names.save()
        ds = Dataset(name=str(year)+' Census Tracts',
                     cached=datetime.utcnow().replace(tzinfo=utc),
                     cache_max_age=1000,
                     name_field='NAMELSAD'+str(year)[-2:],
                     coordinates=coor,
                     names=names)
        if year == 2010:
            ds.remote_id_field = 'GEOID00'
        elif year == 2000:
            ds.remote_id_field = 'CTIDFP00'
        ds.save()

    tract_mapping = {
        'remote_id': ds.remote_id_field,
        'name': ds.name_field,
        'lat': ds.coordinates.lat_field,
        'lon': ds.coordinates.lon_field,
        'field1': ds.names.field1_name,
        'field2': ds.names.field2_name,
        'mpoly': 'MULTIPOLYGON',
    }

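    # List the tract shapefile archives available on the Census FTP server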
    ftp = ftplib.FTP('ftp2.census.gov')
    ftp.login()
    ftp.cwd("/geo/tiger/TIGER2010/TRACT/" + str(year) + "/")
    files = ftp.nlst()

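    # Drop orphaned polygons, then resume from the highest state FIPS code already imported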
    MapPolygon.objects.filter(dataset_id__isnull=True).delete()
    max_state = MapPolygon.objects.filter(dataset_id__exact=ds.id).aggregate(Max('remote_id'))
    max_state = max_state['remote_id__max']
    if max_state is not None:
        try:
            max_state = int(max_state) // 1000000000
            if max_state >= starting_state:
                starting_state = max_state + 1
        except Exception:
            pass

    for i in [format(x, '02d') for x in range(starting_state, 100)]:
        short_name = 'tl_2010_' + i + '_tract' + str(year)[-2:]
        tract_shp = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                    'data/'+short_name))
        if (not os.path.isfile(tract_shp+'.shp')
            or not os.path.isfile(tract_shp+'.shx')
            or not os.path.isfile(tract_shp+'.shp.xml')
            or not os.path.isfile(tract_shp+'.prj')
            or not os.path.isfile(tract_shp+'.dbf')):

            if short_name + '.zip' not in files:
                continue
            if verbose:
                print(short_name + '.shp does not exist locally.\n\tDownloading from Census FTP...')
            try:
                # download the file
                local_file = open(tract_shp+'.zip', 'wb')
                ftp.retrbinary('RETR '+short_name+'.zip', local_file.write)
                local_file.close()
                # open the zip
                zipped = zipfile.ZipFile(tract_shp+'.zip')
                for suffix in ['.shp', '.prj', '.dbf', '.shp.xml', '.shx']:
                    zipped.extract(short_name+suffix, os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')))
            except Exception as inst:
                if verbose:
                    print('\tException:', inst)
                    print('\t' + short_name + '.shp did not download or unzip correctly. Moving on...')
                continue
        tract_shp = tract_shp + '.shp'
        if verbose:
            print('\tBegin layer mapping...')
        lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1')

        while True:
            try:
                lm.save(strict=True, verbose=False)  # verbose)
                break
            # exception part is untested, error didn't happen again
            except Exception as inst:
                yn = ''
                while yn not in ['n', 'y']:
                    yn = input('Error saving: ' + str(inst) + '\nContinue (y/n)?').strip().lower()
                if yn == 'y':
                    MapPolygon.objects.filter(dataset_id__isnull=True).filter(remote_id__startswith=i).delete()
                else:
                    break
        if verbose:
            print('\tLayer mapping done.')
        MapPolygon.objects.filter(dataset=None).update(dataset=ds)
        if verbose:
            print('\tLayer associated with dataset.')
    ftp.quit()

    if verbose:
        print('All shapefiles added.')
Example #28
def upload():
    #try:
    file = request.files['file']
    filename = file.filename
    if filename == '':
        raise ValueError('No file uploaded!!')
    file_uploads_path = os.path.join(config.UPLOADS_DIR, filename)
    file_static_path = os.path.join(config.STATIC_DIR, 'output')
    file_static_path = os.path.join(file_static_path, filename)
    file.save(file_uploads_path)
    cleaned_file = ''
    if (filename.rsplit('.', 1)[1].lower() == 'csv'):
        dirty_file = pd.read_csv(file_uploads_path, sep=',')
        res = data_cleaning.id_classLabel_check(dirty_file)
        if (res != True):
            raise ValueError(res)
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_csv(file_uploads_path, sep=',', index=False)
    elif (filename.rsplit('.', 1)[1].lower() == 'tsv'):
        dirty_file = pd.read_csv(file_uploads_path, sep='\t')
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_csv(file_uploads_path, sep=',', index=False)
    elif (filename.rsplit('.', 1)[1].lower() == 'json'):
        print(str(file_uploads_path))
        dirty_file = pd.read_json(str(file_uploads_path))
        missing_val_fixed_file = data_cleaning.fix_missing(
            dirty_file, request.form['fix'])
        cleaned_file = data_cleaning.clean(missing_val_fixed_file)
        cleaned_file.to_json(file_uploads_path)
    else:
        raise ValueError(
            'Invalid file input! Please check the input file type')
    #cleaned_file =cleaned_file.sort_values(by=['classLabel'])
    #cleaned_file.to_csv('static/test2.csv')
    download_path = 'static/uploads/' + filename
    session['filename'] = filename

    #X =  Dataset.query.filter_by(name = filename)
    #for i in X:
    #    print(cPickle.loads(i.content))

    #SAVING INPUT DATASET TO DATABASE
    try:
        #print(cleaned_file)
        serialized_content = cPickle.dumps(cleaned_file)
        #session['cleaned_file'] = serialized_content
        existingDataset = Dataset.query.filter_by(name=filename).all()
        for data in existingDataset:
            db.session.delete(data)

        dataset = Dataset(filename, download_path, serialized_content)
        db.session.add(dataset)
        db.session.commit()
    except Exception as e:

        print(e)
        #raise ValueError('Dataset with this name already exist in database!. Please update dataset name')

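    # Drop the class label and re-index the frame by id before visualization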
    del cleaned_file['classLabel']
    cleaned_file.index = cleaned_file['id']
    del cleaned_file['id']
    #paramObj = heidi_api.getAllSubspaces(cleaned_file, filename)

    #return render_template('success.html', download_path=download_path, user=current_user)
    #return render_template('dimension_new.html', title = 'visual tool', user = current_user, paramObj = paramObj) #title='dimension Visualization',datasetName=datasetName,user=current_user, dimensions=['a','b','c'])
    return redirect(url_for('heidi_controllers.interactive_heidi'))