Exemplo n.º 1
0
  def test_guess_format_invalid_csv_format(self):
    """Corrupting any guessed CSV format option must make field guessing
    yield an empty column list rather than fail."""
    indexer = MorphlineIndexer("test", solr_client=self.solr_client)
    stream = StringIO.StringIO(TestIndexer.simpleCSVString)

    # (option to corrupt, invalid value) -- exercised one at a time.
    cases = (
        ("fieldSeparator", "invalid separator"),
        ("recordSeparator", "invalid separator"),
        ("quoteChar", "invalid quoteChar"),
    )

    for index, (option, bad_value) in enumerate(cases):
      if index:
        # Rewind so each round re-sniffs the same fixture from the start.
        stream.seek(0)
      guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
      guessed_format[option] = bad_value

      fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
      assert_equal(fields, [])
Exemplo n.º 2
0
    def test_guess_csv_format(self):
        """The indexer should recover both the known format and the known
        column names/types of the simple CSV fixture."""
        indexer = MorphlineIndexer("test", solr_client=self.solr_client)
        stream = StringIO.StringIO(TestIndexer.simpleCSVString)

        guessed_format = indexer.guess_format(
            {'file': {"stream": stream, "name": "test.csv"}})
        fields = indexer.guess_field_types({
            "file": {"stream": stream, "name": "test.csv"},
            "format": guessed_format
        })['columns']

        # The sniffed format must match the fixture's expected format exactly.
        assert_equal(self.simpleCSVFormat, guessed_format)

        # Only name and type are compared per column; other keys are
        # implementation details of the guesser.
        for expected, actual in zip(self.simpleCSVFields, fields):
            for key in ("name", "type"):
                assert_equal(expected[key], actual[key])
Exemplo n.º 3
0
File: api3.py  Project: dulems/hue-1
def guess_format(request):
  """Guess the data format of the importer input described by the POSTed
  ``fileFormat`` JSON payload and return it, plus ``status: 0``, as JSON.

  Supported ``inputFormat`` values:
    file  -- sniff an HDFS file with MorphlineIndexer.
    table -- derive a CSV/parquet format from Hive table metadata.
    query -- fixed CSV format matching Hive query result exports.
    rdbms -- delegate to RdbmsIndexer.

  Raises:
    PopupException: path is not a file, unsupported Hive table format, or
      unrecognized ``inputFormat``.
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    # Hive storage details carry the field delimiter under 'serialization.format'.
    storage = dict((delim['data_type'], delim['comment']) for delim in table_metadata.storage_details)
    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False,}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"}
  elif file_format['inputFormat'] == 'rdbms':
    format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format()
  else:
    # Previously an unrecognized inputFormat fell through to
    # format_['status'] and raised a NameError on the unbound format_.
    # Fail with an explicit, user-visible error instead.
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  format_['status'] = 0
  return JsonResponse(format_)
Exemplo n.º 4
0
def guess_format(request):
    """Guess the serialization format of the importer input described by
    the POSTed ``fileFormat`` JSON payload.

    Handles two input kinds in this excerpt:
      * ``file``  -- opens the (URL-decoded) HDFS path and lets
        MorphlineIndexer sniff the format.
      * ``table`` -- derives a CSV or parquet format from Hive table
        metadata.

    NOTE(review): this excerpt appears truncated by the scraper -- the
    visible body neither handles other input formats nor returns a
    response. Python 2 only (``except Exception, e`` syntax).
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        # Paths arrive URL-encoded from the browser.
        path = urllib.unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception, e:  # py2 syntax; surface metastore errors to the UI
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        # Flatten storage details into a dict: entries are either
        # 'key=value' strings or (data_type, comment) pairs.
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
Exemplo n.º 5
0
  def test_end_to_end(self):
    """Index a small CSV into a Solr collection via a generated morphline."""
    # Requires the morphlines libraries, so only run against a live cluster.
    if not is_live_cluster():
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")

    collection_name = "test_collection"
    hdfs_path = "/tmp/test.csv"
    indexer = MorphlineIndexer("test", fs=fs, jt=cluster.jt, solr_client=self.solr_client)

    # Stage the CSV fixture on HDFS, then open a stream for sniffing.
    fs.create(hdfs_path, data=TestIndexer.simpleCSVString, overwrite=True)
    stream = fs.open(hdfs_path)

    # Guess the file's format and its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Pick (or synthesize) a field to carry each record's uuid.
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields += [{"name": unique_field, "type": "string"}]

    # Recreate the target collection from the derived schema.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Run the morphline job to index the file.
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, hdfs_path)
Exemplo n.º 6
0
File: api3.py  Project: hkj123/hue
def guess_format(request):
    """Guess the data format of the importer input described by the POSTed
    ``fileFormat`` JSON payload and return it, plus ``status: 0``, as JSON.

    Supported ``inputFormat`` values:
      file      -- sniff an HDFS file with MorphlineIndexer.
      table     -- derive a CSV/parquet format from Hive table metadata.
      query     -- fixed CSV format matching Hive query result exports.
      rdbms     -- generic CSV.
      stream    -- kafka (JSON + topic list) or flume (CSV).
      connector -- sfdc: CSV plus queryable Salesforce object names.

    Raises:
      PopupException: path is not a file, unsupported Hive table format, or
        an unrecognized input/stream/connector selection.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        # Paths arrive URL-encoded from the browser.
        path = urllib_unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            # Surface metastore errors to the UI instead of a 500.
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        # Flatten storage details: entries are either 'key=value' strings or
        # (data_type, comment) pairs.
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        # \u0001 is Hive's default field delimiter for query results.
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "json",
                'topics': get_topics(request.user)
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
        else:
            # Previously an unknown streamSelection fell through and raised a
            # NameError on the unbound format_; fail explicitly instead.
            raise PopupException(
                _('Input format %(inputFormat)s stream not recognized: %(streamSelection)s'
                  ) % file_format)
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                # Only queryable Salesforce objects can be imported.
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            # Was '$(connectorSelection)s': %-formatting ignores '$(...)', so
            # the selection never appeared in the message. Use '%(...)s'.
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s'
                  ) % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
Exemplo n.º 7
0
File: api3.py  Project: mapr/hue
def guess_format(request):
    """Guess the data format of the importer input described by the POSTed
    ``fileFormat`` JSON payload and return it, plus ``status: 0``, as JSON.

    Supported ``inputFormat`` values:
      localfile -- fixed excel/CSV format for browser-uploaded files.
      file      -- sniff an HDFS file (Excel files are converted to CSV
                   first via pandas).
      table     -- derive a CSV/parquet format from Hive table metadata.
      query     -- fixed CSV format matching Hive query result exports.
      rdbms     -- generic CSV.
      stream    -- kafka (JSON + topic list) or flume (CSV).
      connector -- sfdc: CSV plus queryable Salesforce object names.

    Raises:
      PopupException: path is not a file, unsupported Hive table format, or
        an unrecognized input/stream/connector selection.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    file_type = file_format['file_type']
    # 'path' is absent for path-less inputs (query, rdbms, stream,
    # connector); previously file_format["path"] raised KeyError for them.
    path = urllib_unquote(file_format.get("path", ''))

    # Excel support depends on py3-only pandas engines.
    if sys.version_info[0] < 3 and (file_type == 'excel' or path[-3:] == 'xls'
                                    or path[-4:] == 'xlsx'):
        return JsonResponse({
            'status': -1,
            'message': 'Python2 based Hue does not support Excel file importer'
        })

    if file_format['inputFormat'] == 'localfile':
        if file_type == 'excel':
            format_ = {"type": "excel", "hasHeader": True}
        else:
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'file':
        if path[-3:] == 'xls' or path[-4:] == 'xlsx':
            # Convert the spreadsheet to CSV on HDFS, then sniff the CSV.
            file_obj = request.fs.open(path)
            if path[-3:] == 'xls':
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='xlrd')
            else:
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='openpyxl')
            _csv_data = df.to_csv(index=False)

            path = excel_to_csv_file_name_change(path)
            request.fs.create(path, overwrite=True, data=_csv_data)

        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)

        # Report spreadsheet sources as 'excel' regardless of the sniff.
        if file_format["path"][-3:] == 'xls' or file_format["path"][
                -4:] == 'xlsx':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "excel",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            # Surface metastore errors to the UI instead of a 500.
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        # Flatten storage details: entries are either 'key=value' strings or
        # (data_type, comment) pairs.
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        # \u0001 is Hive's default field delimiter for query results.
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "json",
                'topics': get_topics(request.user)
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
        else:
            # Previously an unknown streamSelection fell through and raised a
            # NameError on the unbound format_; fail explicitly instead.
            raise PopupException(
                _('Input format %(inputFormat)s stream not recognized: %(streamSelection)s'
                  ) % file_format)
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                # Only queryable Salesforce objects can be imported.
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            # Was '$(connectorSelection)s': %-formatting ignores '$(...)', so
            # the selection never appeared in the message. Use '%(...)s'.
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s'
                  ) % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)