Exemplo n.º 1
0
def export_csv_to_rdf(proj_name, input_file, encoding, json_file, output_file):
    """Create an OpenRefine project from a CSV upload and export it.

    The upload payload comes from ``get_options(proj_name, input_file)``;
    ``encoding`` is sent as the only import option.  Once the project
    exists, ``json_file`` is handed to ``apply_operations`` and
    ``output_file`` to ``export_project``.

    Raises:
        Exception: if the URL of the server response carries no
            ``project`` query parameter.
    """
    server = refine.RefineServer()
    LOGGER.debug("Connected to OpenRefine")

    upload_payload = get_options(proj_name, input_file)
    query = {'options': json.dumps({'encoding': encoding})}
    response = server.urlopen(
        'create-project-from-upload', upload_payload, query)

    # The id of the new project is read from the query string of the URL
    # the response ended up at.
    final_query = urlparse.urlparse(response.geturl()).query
    url_params = urlparse.parse_qs(final_query)
    if 'project' not in url_params:
        raise Exception('Project not created')

    project_id = url_params['project'][0]
    LOGGER.debug("Created project with project id: %s", project_id)
    proj = refine.RefineProject(project_id)
    update_project_file(project_id)

    apply_operations(proj, json_file)
    export_project(proj, output_file)
    if output_file:
        LOGGER.debug("RDF exported to: %s", output_file)
Exemplo n.º 2
0
 def setUp(self):
     """Create the server/client fixtures and, if configured, a project.

     A project is only created when ``self.project_file`` is truthy; it is
     built from ``project_file``, ``project_format`` and the extra
     keyword options in ``self.project_options``.
     """
     server = refine.RefineServer()
     self.server = server
     self.refine = refine.Refine(server)
     if not self.project_file:
         return
     self.project = self.refine.new_project(
         project_file=self.project_file,
         project_format=self.project_format,
         **self.project_options)
Exemplo n.º 3
0
 def test_init(self):
     """Check the default server URL and trailing-slash stripping."""
     # The port is only appended when it is not the string '80'.
     port_suffix = ''
     if refine.REFINE_PORT != '80':
         port_suffix = ':' + refine.REFINE_PORT
     expected_url = 'http://' + refine.REFINE_HOST + port_suffix
     self.assertEqual(self.server.server, expected_url)
     self.assertEqual(refine.RefineServer.url(), expected_url)
     # strip trailing /
     slashed = refine.RefineServer('http://refine.example/')
     self.assertEqual(slashed.server, 'http://refine.example')
Exemplo n.º 4
0
def info(project_id):
    """Print id, name, created and modified for the given project.

    Every project reported by the server is compared against
    ``project_id``; nothing is printed when none matches.
    """
    client = refine.Refine(refine.RefineServer())
    for candidate_id, metadata in client.list_projects().items():
        if project_id != candidate_id:
            continue
        print('{0}: {1}'.format('id', candidate_id))
        for field in ('name', 'created', 'modified'):
            print('{0}: {1}'.format(field, metadata[field]))
Exemplo n.º 5
0
 def set_up(self):
     """Create the server/client fixtures and, if configured, a project.

     When ``self.project_file`` is truthy a project is created from
     ``self.project_path()`` with a ``'|'`` separator, the configured
     format and the extra options in ``self.project_options``.
     """
     server = refine.RefineServer()
     self.server = server
     self.refine = refine.Refine(server)
     if not self.project_file:
         return
     self.project = self.refine.new_project(
         project_file=self.project_path(),
         project_format=self.project_format,
         separator='|',
         **self.project_options)
Exemplo n.º 6
0
def list_projects():
    """Query the Refine server and list projects by ID: name.

    One ``id: name`` line is printed per project, most recently modified
    project first.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects().items()

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() accepts any iterable, so this also works when items()
    # returns a view object that has no .sort() method (Python 3).
    newest_first = sorted(
        projects,
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    for project_id, project_info in newest_first:
        print('{0:>14}: {1}'.format(project_id, project_info['name']))
Exemplo n.º 7
0
 def connect(self):
     """Configure the refine module from settings and return a client.

     Raises:
         CommandError: if GOOGLE_REFINE_HOST or GOOGLE_REFINE_PORT is
             missing from settings, or if fetching the server version
             raises URLError.
     """
     try:
         refine.REFINE_HOST = settings.GOOGLE_REFINE_HOST
         refine.REFINE_PORT = settings.GOOGLE_REFINE_PORT
     except AttributeError:
         raise CommandError(
             'Set GOOGLE_REFINE_HOST and GOOGLE_REFINE_PORT in settings')
     try:
         refine_server = refine.RefineServer()
         client = refine.Refine(refine_server)
         # The version request is used as a reachability probe.
         refine_server.get_version()
     except URLError:
         raise CommandError('Google Refine server is not reachable.')
     else:
         return client
Exemplo n.º 8
0
def ls():
    """Query the server and list projects sorted by mtime.

    Prints one ``id: name`` line per project, most recently modified
    first, or an error line when the server reports no projects.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects().items()

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() accepts any iterable, so this also works when items()
    # returns a view object that has no .sort() method (Python 3).
    projects = sorted(
        projects,
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    if not projects:
        print('Error: No projects found')
        return
    for project_id, project_info in projects:
        # NOTE(review): the explicit UTF-8 encode looks aimed at Python 2
        # stdout; on Python 3 print() would show a bytes repr - confirm
        # the target interpreter before changing it.
        print(u'{0:>14}: {1}'.format(project_id,
                                     project_info['name']).encode('utf-8'))
Exemplo n.º 9
0
def info(project_id):
    """Print metadata and column names of one project.

    Output is the id, the project URL, every truthy metadata entry the
    server lists for the project, and the numbered column names of its
    model.  An error message is printed instead when ``project_id`` is
    not among the server's projects.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return

    print('{0:>20}: {1}'.format('id', project_id))
    project_url = ('http://' + refine.REFINE_HOST + ':' + refine.REFINE_PORT +
                   '/project?project=' + project_id)
    print('{0:>20}: {1}'.format('url', project_url))

    # Empty / falsy metadata entries are not shown.
    for key, value in projects[project_id].items():
        if not value:
            continue
        print(u'{0:>20}: {1}'.format(key, value))

    models = refine.RefineProject(project_id).get_models()
    column_names = [c['name'] for c in models['columnModel']['columns']]
    for position, name in enumerate(column_names, start=1):
        label = u'column ' + str(position).zfill(3)
        print(u'{0:>20}: {1}'.format(label, name))
Exemplo n.º 10
0
# @in to:['Waldorf Astoria','Hamburg Amerika Linie','Norddeutscher Lloyd Bremen']
# @in Json_History_id
# @in function:getToValue
# @in function:getFromValue
# @out outputFile @uri file: PartTest.tsv
# @out projectID
# @out projectNoRows


# @begin CreateProject @desc create project from file
# @in csvFile @uri file: partTest.csv
# @in refinePythonFile @uri file: refine.py
# @out projectID
# @out projectNoRows
from google.refine import refine
# Create the project on the Refine server and keep element [1] of the
# result as projectID.
# NOTE(review): assumes this refine.py's new_project returns an indexable
# result whose second element is the project id - confirm.
projectID = refine.Refine(refine.RefineServer()).new_project(
    'partTest.csv', 'HalfMenuDataset', '.csv')[1]
# print(refine.myParser('--list'))
# @end CreateProject


'''insert a function to automatically get 'from' '''
def getFromValue(computeCluster):
    """Return the ``'value'`` entries of every cluster, one list per cluster.

    ``computeCluster`` is an iterable of clusters, each an iterable of
    mappings that have a ``'value'`` key.  The nesting of the input is
    kept: the result holds one inner list per cluster, in input order.
    """
    return [[member['value'] for member in cluster]
            for cluster in computeCluster]
Exemplo n.º 11
0
def main():
    """Command line interface.

    Reads the docker link environment variables, parses the command line,
    points the refine module at the requested server, resolves a project
    given by name (first positional argument) to its id and runs the
    selected commands.

    Returns:
        The ``refine.RefineProject`` that was operated on for the apply,
        export/output and info commands; ``None`` otherwise.

    Raises:
        Exception: if a project name matches no project, or more than one.
    """

    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()

    # Show usage when none of the group2/group3 options was set.
    command_given = any(
        getattr(options, group_arg.dest) is not None
        for group in (group2, group3)
        for group_arg in group.option_list)
    if not command_given:
        PARSER.print_usage()
        return

    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # A non-numeric first argument is a project name: replace it by the id.
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(project_id)
            for project_id, project_info in projects
            if args[0] == project_info['name']
        ]
        if not idlist:
            # Previously this fell through to idlist[0] and died with a
            # bare IndexError.
            raise Exception('No project found with name %s.' % args[0])
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        args[0] = idlist[0]

    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            # write() behaves the same under Python 2 and 3, unlike the
            # "print >> stream" statement.
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
Exemplo n.º 12
0
def create_project(options):
    """Create a new project from options.create file.

    The input format is guessed from the file extension ('txt' becomes
    'fixed-width' when ``options.columnWidths`` is set and 'line-based'
    otherwise) unless ``options.input_format`` overrides it.  The
    per-format defaults are then overlaid with every group4 option the
    user set; the strings 'true' and 'false' are turned into booleans.

    Raises:
        KeyError: if the resulting format has no entry in the defaults
            table.
    """
    # general defaults are defined in google_refine/refine/refine.py new_project
    # additional defaults for each file type
    defaults = {
        'xml': {'project_format': 'text/xml', 'recordPath': 'record'},
        'csv': {
            'project_format': 'text/line-based/*sv',
            'separator': ','
        },
        'tsv': {
            'project_format': 'text/line-based/*sv',
            'separator': '\t'
        },
        'line-based': {
            'project_format': 'text/line-based',
            'skipDataLines': -1
        },
        'fixed-width': {
            'project_format': 'text/line-based/fixed-width',
            'headerLines': 0
        },
        'json': {
            'project_format': 'text/json',
            'recordPath': ('_', '_')
        },
        'xls': {
            'project_format': 'binary/text/xml/xls/xlsx',
            'sheets': 0
        },
        'xlsx': {
            'project_format': 'binary/text/xml/xls/xlsx',
            'sheets': 0
        },
        'ods': {'project_format': 'text/xml/ods', 'sheets': 0},
    }
    # guess format from file extension (or legacy option --format)
    input_format = os.path.splitext(options.create)[1][1:].lower()
    if input_format == 'txt':
        if options.columnWidths:
            input_format = 'fixed-width'
        else:
            input_format = 'line-based'
    if options.input_format:
        input_format = options.input_format
    # defaults for selected format
    input_dict = defaults[input_format]
    # merge defaults with user input: unset options (None) are skipped and
    # the strings 'true' / 'false' are converted to booleans
    for group4_arg in group4.option_list:
        value = getattr(options, group4_arg.dest)
        if value is None:
            continue
        if value == 'true':
            value = True
        elif value == 'false':
            value = False
        input_dict[group4_arg.dest] = value
    input_dict['project_file'] = options.create
    refine.Refine(refine.RefineServer()).new_project(**input_dict)
Exemplo n.º 13
0
def main():
    """Command line interface.

    Parses the command line, points the refine module at the requested
    server, resolves ``options.project_id`` (an id or a project name) and
    dispatches to exactly one ``cli`` command.  Help is printed when no
    command option is set.
    """

    options = parser.parse_args()

    def collect_kwargs(actions):
        """Return the set options of ``actions`` as keyword arguments.

        Options left at None are dropped; the strings 'true' and 'false'
        are converted to booleans.
        """
        collected = {}
        for action in actions:
            value = getattr(options, action.dest)
            if value is None:
                continue
            if value == 'true':
                value = True
            elif value == 'false':
                value = False
            collected[action.dest] = value
        return collected

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id (stays None when no project was given at all)
    project_id = None
    if options.project_id and str.isdigit(options.project_id):
        project_id = options.project_id
    elif options.project_id:
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(candidate_id)
            for candidate_id, project_info in projects
            if options.project_id == project_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found {idlist} projects with name {name}.\n'
                  'Please specify project by id.'.format(
                      idlist=len(idlist), name=options.project_id))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name {}.\n'
                  'Try command --list'.format(options.project_id))
            return
        project_id = idlist[0]

    needs_project = (options.info or options.delete or options.apply
                     or options.template or options.export or options.output)

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = collect_kwargs(CreateGroup._group_actions)
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif needs_project and project_id is None:
        # Previously this path crashed on the unbound project_id variable.
        print('Error: No project specified.\n'
              'Try command --list')
    elif options.info:
        cli.info(project_id)
    elif options.delete:
        cli.delete(project_id)
    elif options.apply:
        cli.apply(project_id, options.apply)
    elif options.template:
        kwargs = collect_kwargs(TemplateGroup._group_actions)
        # The original built these kwargs and then did nothing with them,
        # so the template command was a silent no-op.
        # NOTE(review): call signature assumed to be
        # cli.templating(project_id, template, output_file=..., **kwargs);
        # the create-only 'project_format' entry is no longer added -
        # confirm against the cli module.
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif options.export or options.output:
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        parser.print_help()
Exemplo n.º 14
0
def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False):
    """Create a new project from file.

    When ``project_format`` is not given it is guessed from the extension
    of ``project_file``; a ``.txt`` file is treated as 'fixed-width' if
    ``columnWidths`` has a first element and as 'line-based' otherwise.
    The short format name is then replaced by the format string sent to
    the server, and format-specific defaults are filled in for options
    that were left falsy.

    Returns:
        The project returned by ``new_project``, after printing its id
        and row count.

    Raises:
        Exception: if the created project reports 0 rows.
    """
    # guess format from file extension
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            try:
                columnWidths[0]
                project_format = 'fixed-width'
            except (TypeError, IndexError):
                # None (TypeError) or an empty sequence (IndexError, which
                # used to escape uncaught): no column widths were given.
                project_format = 'line-based'
    # defaults for each file type
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format in ('xls', 'xlsx'):
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # execute
    # vars() captures every local bound so far, i.e. exactly the function
    # parameters (with the adjustments made above).  Do not introduce other
    # local variables above this line: they would be forwarded as well.
    kwargs = {k: v for k, v in vars().items() if v is not None}
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    else:
        raise Exception(
            'Project contains 0 rows. Please check --help for mandatory '
            'arguments for xml, json, xlsx and ods')
Exemplo n.º 15
0
# Pack the lines of DATA (the first line is skipped) into records of 50
# stripped lines each, joined by '#####', and write one record per line of
# PROCESSED.
first, count, record = True, 0, ''
with open(DATA, 'rb') as inp:
    with open(PROCESSED, 'wb') as out:
        for line in inp:
            if first:
                first = False
                continue
            if count == 50:
                # A non-empty record always ends with one 5-character
                # separator.  Slice exactly that off: rstrip('#####')
                # strips a *set of characters* and would also eat '#'
                # characters and empty fields at the end of the data.
                out.write(record[:-5] + '\n')
                count, record = 0, ''
            record += line.strip() + '#####'
            count += 1
        if record:
            out.write(record[:-5] + '\n')

# Now create the sample project using sampled data
sampling_ratio = 0.1
no_of_sampled_lines = int(sampling_ratio * no_of_lines)
subprocess.Popen(['cp', HEADER, SAMPLED]).communicate()
with open(SAMPLED, 'ab') as out:
    subprocess.Popen(['shuf', '-n {0}'.format(no_of_sampled_lines),
                      DATA], stdout=out).communicate()

server = refine.RefineServer()
# NOTE: from here on the name ``refine`` refers to the client instance and
# no longer to the module.
refine = refine.Refine(server)
project = refine.new_project(project_file=SAMPLED,
                             project_format=_format,
                             project_options=_options)
# Single-argument print(...) produces the same output as the Python 2
# print statement and also parses under Python 3.
print("Done")
print("Open: " + project.project_url())
Exemplo n.º 16
0
def main():
    """Command line interface.

    Parses the command line, points the refine module at the requested
    server, resolves the first positional argument (a project id or a
    project name) and dispatches to exactly one ``cli`` command.  Usage
    is printed when no command applies.
    """

    options, args = PARSER.parse_args()

    def typed_kwargs(option_group):
        """Return the set options of ``option_group`` as keyword arguments.

        Options left at None are dropped; the strings 'true' and 'false'
        are converted to booleans.
        """
        collected = {}
        for opt in option_group.option_list:
            value = getattr(options, opt.dest)
            if value is None:
                continue
            if value == 'true':
                value = True
            elif value == 'false':
                value = False
            collected[opt.dest] = value
        return collected

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    if args:
        if str.isdigit(args[0]):
            project_id = args[0]
        else:
            # A non-numeric argument is a project name: look up its id.
            known = refine.Refine(refine.RefineServer()).list_projects().items()
            idlist = [
                str(known_id)
                for known_id, known_info in known
                if args[0].decode('UTF-8') == known_info['name']
            ]
            if not idlist:
                print('Error: No project found with name %s.\n'
                      'Try command --list' % args[0])
                return
            if len(idlist) > 1:
                print('Error: Found %s projects with name %s.\n'
                      'Please specify project by id.' % (len(idlist), args[0]))
                for i in idlist:
                    print('')
                    cli.info(i)
                return
            project_id = idlist[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = typed_kwargs(group5)
        if options.file_format:
            kwargs['project_format'] = options.file_format
        cli.create(options.create, **kwargs)
    # commands with args
    elif args and options.info:
        cli.info(project_id)
    elif args and options.delete:
        cli.delete(project_id)
    elif args and options.apply:
        cli.apply(project_id, options.apply)
    elif args and options.template:
        kwargs = typed_kwargs(group6)
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif args and (options.export or options.output):
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        PARSER.print_usage()