def setUp(self):
    """Create a throwaway Refine project from TEST_DATA and open it."""
    new_id = create.create_refine_project(
        TEST_DATA, 'Test project (can be safely removed).')
    # Open the project through a fresh client instead of reusing anything
    # from the creation step: reopening forces the refresh that is needed
    # after the JSON operations have been applied.
    self.project = refine.Refine(create.SERVER).open_project(new_id)
def create_refine_project(path, name, pretend=False, verbose=0):
    """
    Creates a project in google Refine and loads the affiliations.

    Parameters:
        path: path of the affiliation file; it is made absolute and passed
            through clean_ads_affs() to produce the file that is uploaded.
        name: accepted for interface compatibility.
            NOTE(review): this value is not used; the project name is
            always derived from the file name and the current time.
        pretend: when true, the upload file is still prepared but no
            project is created on the Refine server.
        verbose: verbosity level forwarded to msg() and clean_ads_affs().

    Returns:
        The id of the newly created project, or None in pretend mode.
    """
    input_file = os.path.abspath(path)

    msg('Create a file that we can upload to Refine.', verbose)
    new_input_file = clean_ads_affs(input_file, verbose)

    msg('Upload to Refine.', verbose)
    project_name = 'Astronomy affiliations (%s) (created %s)' % (
        os.path.basename(path).replace('.reversed', '.merged'),
        time.asctime())
    print('Creating project "%s".' % project_name)

    if pretend:
        # Dry run: nothing was uploaded, so there is no project whose id
        # could be returned.
        return None

    r = refine.Refine(SERVER)
    project = r.new_project(
        project_file=new_input_file,
        project_name=project_name,
        split_into_columns=True,
        separator='\t',
        ignore_initial_non_blank_lines=0,
        header_lines=1,
        skip_initial_data_rows=0,
        limit=0,
        guess_value_type=False,
        ignore_quotes=False)

    msg('Done with success.', verbose)
    return project.project_id
def setUp(self):
    """Create a test project, apply a fixed set of edits, export the result.

    The opened project is stored on ``self.project`` and the formatted
    affiliations on ``self.affs``.
    """
    project_id = create.create_refine_project(
        TEST_DATA, 'Test project (can be safely removed).')
    # The project is reopened so that the refresh is forced after the
    # JSON operations have been applied.
    self.project = refine.Refine(create.SERVER).open_project(project_id)

    # (column, current value, replacement value), applied in this order.
    edits = (
        # Modify an affiliation.
        ('New affiliation',
         'Astronomical Institute "Anton Pannekoek", University of Amsterdam, Kruislaan 403, NL--1098 SJ Amsterdam, The Netherlands',
         'Astronomical Institute "Anton Pannekoek"'),
        # Remove an affiliation.
        ('New affiliation', 'San Cosme y Damian, Paraguay', ''),
        # Modify an email.
        ('New emails', """[u'*****@*****.**"']""", "[u'*****@*****.**']"),
        # Remove an email.
        ('New emails', "[u'*****@*****.**']", ''),
        ('New emails', "[u'*****@*****.**']", '[]'),
    )
    for column, old_value, new_value in edits:
        self.project.edit(column, old_value, new_value)

    # Grab the affiliations.
    self.affs = export.format_affiliations(project_id)
def setUp(self):
    """Connect to Refine and create the test project if a file is set.

    ``self.project`` is only assigned when ``self.project_file`` is truthy.
    """
    self.server = refine.RefineServer()
    self.refine = refine.Refine(self.server)
    if not self.project_file:
        # No fixture file configured: only the connection is prepared.
        return
    self.project = self.refine.new_project(
        project_file=self.project_file,
        project_format=self.project_format,
        **self.project_options)
def set_up(self):
    """Connect to Refine and, if a project file is set, create the project.

    The project is created from ``self.project_path()`` with ``'|'`` as the
    separator; ``self.project`` stays unset when ``self.project_file`` is
    falsy.
    """
    self.server = refine.RefineServer()
    self.refine = refine.Refine(self.server)
    if not self.project_file:
        return
    self.project = self.refine.new_project(
        project_file=self.project_path(),
        project_format=self.project_format,
        separator='|',
        **self.project_options)
def info(project_id):
    """Show project metadata"""
    listing = refine.Refine(refine.RefineServer()).list_projects()
    for current_id, metadata in listing.items():
        if current_id != project_id:
            continue
        print('{0}: {1}'.format('id', current_id))
        # The remaining fields are read straight from the project listing.
        for field in ('name', 'created', 'modified'):
            print('{0}: {1}'.format(field, metadata[field]))
def list_projects():
    """Query the Refine server and list projects by ID: name.

    Projects are printed one per line, ordered by their 'modified'
    timestamp with the most recently modified project first.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() rather than .items().sort(): items() is only a sortable list
    # on Python 2, while sorted() accepts the Python 3 view as well.
    by_mtime = sorted(
        projects.items(),
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    for project_id, project_info in by_mtime:
        print('{0:>14}: {1}'.format(project_id, project_info['name']))
def connect(self):
    """Configure the refine module from settings and return a client.

    Raises:
        CommandError: if GOOGLE_REFINE_HOST or GOOGLE_REFINE_PORT is
            missing from settings, or if a URLError occurs while
            contacting the server.
    """
    try:
        refine.REFINE_HOST = settings.GOOGLE_REFINE_HOST
        refine.REFINE_PORT = settings.GOOGLE_REFINE_PORT
    except AttributeError:
        raise CommandError(
            'Set GOOGLE_REFINE_HOST and GOOGLE_REFINE_PORT in settings')

    try:
        refine_server = refine.RefineServer()
        client = refine.Refine(refine_server)
        # The version itself is discarded; the call is only made so that
        # an unreachable server surfaces here as a URLError.
        refine_server.get_version()
    except URLError:
        raise CommandError('Google Refine server is not reachable.')
    else:
        return client
def ls():
    """Query the server and list projects sorted by mtime.

    Prints one ``id: name`` line per project, most recently modified
    first, or an error line when the server reports no projects.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if not projects:
        print('Error: No projects found')
        return

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() rather than .items().sort(): items() is only a sortable list
    # on Python 2.
    by_mtime = sorted(
        projects.items(),
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    for project_id, project_info in by_mtime:
        # The line is encoded explicitly before printing, as in the
        # original implementation.
        print(u'{0:>14}: {1}'.format(
            project_id, project_info['name']).encode('utf-8'))
def __init__(self, source=None):
    """Bind this object to the OpenRefine project that belongs to *source*.

    When ``source.ORid`` is falsy a project is created through
    ``self.createOR(source)``; otherwise the existing project with that id
    is wrapped in a ``refine.RefineProject``.
    """
    server_url = current_app.config.get("OPENREFINE_SERVER")
    self.refine_server = refine.Refine(server=server_url)

    # The source object carries the OpenRefine project id (ORid).
    # NOTE(review): the default source=None is dereferenced below, so a
    # source object must effectively always be supplied - confirm callers.
    self.source = source
    if source.ORid:
        self.refineproj = refine.RefineProject(
            server=server_url,
            project_id=str(int(self.source.ORid)))
    else:
        # No project id recorded yet: create the project.
        self.refineproj = self.createOR(source)
def latest_ast_affiliations_project_id():
    """
    Returns the project id of the latest astronomy affiliations project.

    Project names are searched for the file-name pattern
    ``affils.ast.<8 digits>_<4 digits>``; the project whose matched text
    compares greatest as a string wins. Returns None when no project name
    matches.
    """
    server = refine.Refine(SERVER)
    # Raw string so the \d escapes reach the regex engine unchanged.
    name_pattern = re.compile(r'affils.ast.\d{8}_\d{4}')

    latest_project = ('', None)
    for candidate_id, properties in server.list_projects().items():
        match = name_pattern.search(properties['name'])
        if match is None:
            continue
        file_name = match.group()
        # Strict comparison: on equal names the first project seen is kept.
        if file_name > latest_project[0]:
            latest_project = (file_name, candidate_id)

    print('Extracting from project "%s".' % latest_project[1])
    return latest_project[1]
def info(project_id):
    """Show project metadata

    Prints the id, the project URL, every non-empty metadata entry from the
    project listing and the numbered column names. Prints an error message
    instead when the id is not in the project listing.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return

    print('{0:>20}: {1}'.format('id', project_id))
    # format() instead of '+' so that a non-string port (e.g. an int set
    # programmatically) does not raise TypeError.
    url = 'http://{0}:{1}/project?project={2}'.format(
        refine.REFINE_HOST, refine.REFINE_PORT, project_id)
    print('{0:>20}: {1}'.format('url', url))

    for k, v in projects[project_id].items():
        if v:
            print(u'{0:>20}: {1}'.format(k, v))

    project_model = refine.RefineProject(project_id).get_models()
    columns = [c['name'] for c in project_model['columnModel']['columns']]
    for (i, v) in enumerate(columns, start=1):
        print(u'{0:>20}: {1}'.format(u'column ' + str(i).zfill(3), v))
def testORLoad(sourceurl=None, fileobj=None):
    """Try to load source data into a new OpenRefine project.

    The data is fetched from *sourceurl* (preferred) or read from the path
    *fileobj*, written to a temporary ``.csv`` file and uploaded as a new
    project named "testerhere".

    Returns:
        False when neither input is given or when project creation raises;
        None (implicitly) when project creation succeeds.
    """
    # download source data and save to temp file
    # add checks for a valid URL or file path
    if not sourceurl and not fileobj:
        print("You're missing the sourceurl or the fileobj")
        # Without an input there is no data to write; bail out before a
        # temp file is created.
        return False

    if sourceurl:
        res = requests.get(sourceurl)
        datatext = res.text
    else:
        with codecs.open(fileobj, 'rb') as datafile:
            datatext = datafile.read()

    filepath = os.path.join(
        tempfile.gettempdir(),
        str(int(time.time())) + ".csv").replace("\\", "/")
    with codecs.open(filepath, 'wb', 'utf-8') as f:
        f.write(datatext)

    OPENREFINE_SERVER = current_app.config.get("OPENREFINE_SERVER")
    #store raw file here with barn
    try:
        refine_server = refine.Refine(server=OPENREFINE_SERVER)
        refine_server.new_project(
            project_file=filepath,
            project_name="testerhere",
            separator=',',
            #store_blank_rows=True,
            #store_blank_cells_as_nulls=True
        )
    except Exception as e:
        print("hit error on project creation")
        print(e)
        # The temp file is only removed on this failure path.
        os.remove(filepath)
        return False
def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False):
    """Create a new project from file.

    When project_format is not given it is guessed from the extension of
    project_file; a 'txt' file counts as fixed-width if columnWidths can be
    indexed and as line-based otherwise. The short format name is then
    mapped to the format identifier sent to the server and format-specific
    defaults are filled in for options the caller left empty.

    Returns the created project after printing its id and row count.

    Raises Exception when the created project contains 0 rows.
    """
    # guess format from file extension
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            try:
                # Probe only: indexing None raises TypeError, which selects
                # the line-based importer.
                # NOTE(review): an empty list would raise IndexError here,
                # which is not caught - confirm callers never pass [].
                columnWidths[0]
                project_format = 'fixed-width'
            except TypeError:
                project_format = 'line-based'
    # defaults for each file type
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            # Default record path: the tag of the document's root element.
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format == 'xls':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'xlsx':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # execute
    # vars() captures every local bound at this point (all parameters,
    # including the values adjusted above) that is not None. Introducing a
    # new local variable before this line would add it to the arguments
    # sent to new_project.
    kwargs = {k: v for k, v in vars().items() if v is not None}
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    # Read the total row count to verify that the import produced data.
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    else:
        raise Exception(
            'Project contains 0 rows. Please check --help for mandatory '
            'arguments for xml, json, xlsx and ods')
# @in to:['Waldorf Astoria','Hamburg Amerika Linie','Norddeutscher Lloyd Bremen']
# @in Json_History_id
# @in function:getToValue
# @in function:getFromValue
# @out outputFile @uri file: PartTest.tsv
# @out projectID
# @out projectNoRows

# @begin CreateProject @desc create project from file
# @in csvFile @uri file: partTest.csv
# @in refinePythonFile @uri file: refine.py
# @out projectID
# @out projectNoRows
from google.refine import refine

# Element [1] of the value returned by new_project is kept as the project id.
projectID = refine.Refine(refine.RefineServer()).new_project(
    'partTest.csv', 'HalfMenuDataset', '.csv')[1]
# print(refine.myParser('--list'))
# @end CreateProject

'''insert a function to automatically get 'from' '''


def getFromValue(computeCluster):
    """Return, for every cluster, the list of its entries' 'value' fields.

    *computeCluster* is iterated as a sequence of clusters, each of which
    is a sequence of mappings with a 'value' key; the result has one inner
    list per cluster, in the same order.
    """
    return [
        [entry['value'] for entry in cluster]
        for cluster in computeCluster
    ]
def main():
    """Command line interface.

    Host and port are taken from the docker link environment variables
    when present and then overridden by --host/--port. A non-numeric first
    positional argument is treated as a project name and resolved to its
    id. The selected commands are then executed in order.

    Returns:
        The RefineProject for the apply, export/output and info commands;
        None otherwise.

    Raises:
        Exception: if a project name matches more than one project or no
            project at all.
    """
    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()
    commands_dict = {
        group2_arg.dest: getattr(options, group2_arg.dest)
        for group2_arg in group2.option_list
    }
    commands_dict.update({
        group3_arg.dest: getattr(options, group3_arg.dest)
        for group3_arg in group3.option_list
    })
    commands_dict = {k: v for k, v in commands_dict.items() if v is not None}
    if not commands_dict:
        # No command option given at all.
        PARSER.print_usage()
        return

    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    if args and not str.isdigit(args[0]):
        # Resolve a project name to its id.
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = []
        for project_id, project_info in projects:
            if args[0] == project_info['name']:
                idlist.append(str(project_id))
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        if not idlist:
            # Previously this surfaced as a bare IndexError.
            raise Exception(
                'No project found with name %s. '
                'Please specify project by id.' % args[0])
        args[0] = idlist[0]

    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
def create_project(options):
    """Create a new project from options.create file.

    The input format is guessed from the file extension ('txt' becomes
    'fixed-width' when options.columnWidths is set and 'line-based'
    otherwise) unless options.input_format overrides it. The defaults of
    that format are merged with the user supplied group4 options, where the
    strings 'true' and 'false' are converted to booleans and unset (None)
    options are ignored.

    Raises:
        KeyError: if the resulting input format is not supported.
    """
    # general defaults are defined in google_refine/refine/refine.py new_project
    # additional defaults for each file type
    defaults = {
        'xml': {'project_format': 'text/xml', 'recordPath': 'record'},
        'csv': {'project_format': 'text/line-based/*sv', 'separator': ','},
        'tsv': {'project_format': 'text/line-based/*sv', 'separator': '\t'},
        'line-based': {
            'project_format': 'text/line-based',
            'skipDataLines': -1,
        },
        'fixed-width': {
            'project_format': 'text/line-based/fixed-width',
            'headerLines': 0,
        },
        'json': {'project_format': 'text/json', 'recordPath': ('_', '_')},
        'xls': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'xlsx': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'ods': {'project_format': 'text/xml/ods', 'sheets': 0},
    }

    # guess format from file extension (or legacy option --format)
    input_format = os.path.splitext(options.create)[1][1:].lower()
    if input_format == 'txt':
        input_format = 'fixed-width' if options.columnWidths else 'line-based'
    if options.input_format:
        input_format = options.input_format

    # defaults for selected format
    try:
        input_dict = defaults[input_format]
    except KeyError:
        raise KeyError('Unsupported input format %r; expected one of: %s'
                       % (input_format, ', '.join(sorted(defaults))))

    # user input
    input_user = {
        group4_arg.dest: getattr(options, group4_arg.dest)
        for group4_arg in group4.option_list
    }
    # merge defaults with user input: unset options keep the default,
    # 'true'/'false' become booleans, everything else is passed as given
    for key, value in input_user.items():
        if value is None:
            continue
        if value == 'true':
            value = True
        elif value == 'false':
            value = False
        input_dict[key] = value

    input_dict['project_file'] = options.create
    refine.Refine(refine.RefineServer()).new_project(**input_dict)
def main():
    """Command line interface."""
    options, args = PARSER.parse_args()

    def typed_options(group):
        """Collect the set options of *group*, mapping 'true'/'false' to bools."""
        raw = {
            opt.dest: getattr(options, opt.dest)
            for opt in group.option_list
        }
        converted = {}
        for key, value in raw.items():
            if value == 'true':
                converted[key] = True
            elif value == 'false':
                converted[key] = False
            elif value is not None:
                converted[key] = value
        return converted

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    if args and not str.isdigit(args[0]):
        # A non-numeric argument is a project name: look up matching ids.
        listing = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(candidate_id)
            for candidate_id, candidate_info in listing
            if args[0].decode('UTF-8') == candidate_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found %s projects with name %s.\n'
                  'Please specify project by id.' % (len(idlist), args[0]))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name %s.\n'
                  'Try command --list' % args[0])
            return
        project_id = idlist[0]
    elif args:
        project_id = args[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = typed_options(group5)
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif args and options.info:
        cli.info(project_id)
    elif args and options.delete:
        cli.delete(project_id)
    elif args and options.apply:
        cli.apply(project_id, options.apply)
    elif args and options.template:
        kwargs = typed_options(group6)
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif args and (options.export or options.output):
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        PARSER.print_usage()
def main():
    """Delete every project whose name starts with 'Test project'."""
    client = refine.Refine(SERVER)
    for project_id, details in client.list_projects().items():
        if not details['name'].startswith('Test project'):
            continue
        client.open_project(project_id).delete()
def main():
    """Command line interface.

    Resolves the optional project argument (a numeric id, or a name that is
    looked up on the server) and dispatches to exactly one command in this
    order: list, download, create, info, delete, apply, template,
    export/output. Prints the help text when no command was selected.
    """
    options = parser.parse_args()

    def typed_kwargs(group):
        """Collect the set options of *group*, mapping 'true'/'false' to bools."""
        arg_dict = {
            arg.dest: getattr(options, arg.dest)
            for arg in group._group_actions
        }
        kwargs = {
            k: v
            for k, v in arg_dict.items()
            if v is not None and v not in ['true', 'false']
        }
        kwargs.update({k: True for k, v in arg_dict.items() if v == 'true'})
        kwargs.update({k: False for k, v in arg_dict.items() if v == 'false'})
        return kwargs

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    # Bound up front so the project commands below can detect a missing
    # project instead of failing with a NameError.
    project_id = None
    if options.project_id and str.isdigit(options.project_id):
        project_id = options.project_id
    elif options.project_id:
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(candidate_id)
            for candidate_id, project_info in projects
            if options.project_id == project_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found {idlist} projects with name {name}.\n'
                  'Please specify project by id.'.format(
                      idlist=len(idlist), name=options.project_id))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name {}.\n'
                  'Try command --list'.format(options.project_id))
            return
        project_id = idlist[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = typed_kwargs(CreateGroup)
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif project_id is None and (options.info or options.delete
                                 or options.apply or options.template
                                 or options.export or options.output):
        print('Error: No project specified.\n'
              'Please specify project by id or name.')
    elif options.info:
        cli.info(project_id)
    elif options.delete:
        cli.delete(project_id)
    elif options.apply:
        cli.apply(project_id, options.apply)
    elif options.template:
        # The template options were previously collected but never used;
        # hand them to the templating command.
        kwargs = typed_kwargs(TemplateGroup)
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif options.export or options.output:
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        parser.print_help()
def get_refine_ws():
    """Returns a new Google Refine workspace."""
    # The workspace is built for whatever URL get_refine_server_url() returns.
    return refine.Refine(get_refine_server_url())
# Pack DATA into PROCESSED: the first (header) line is dropped and every
# run of up to 50 stripped input lines becomes one output line whose parts
# are joined by '#####'.
batch = []
with open(DATA, 'rb') as inp:
    with open(PROCESSED, 'wb') as out:
        next(inp, None)  # skip the header line
        for line in inp:
            if len(batch) == 50:
                out.write('#####'.join(batch) + '\n')
                batch = []
            batch.append(line.strip())
        if batch:
            # join() places the delimiter only between parts. The previous
            # rstrip('#####') removed every trailing '#', which also ate
            # trailing empty fields and data that ends in '#'.
            out.write('#####'.join(batch) + '\n')

# Now create the sample project using sampled data
sampling_ratio = 0.1
no_of_sampled_lines = int(sampling_ratio * no_of_lines)

# Start the sample file with the header, then append randomly chosen lines.
subprocess.Popen(['cp', HEADER, SAMPLED]).communicate()
with open(SAMPLED, 'ab') as out:
    # Option and value are passed as separate argv items.
    subprocess.Popen(['shuf', '-n', str(no_of_sampled_lines), DATA],
                     stdout=out).communicate()

server = refine.RefineServer()
# NOTE: this rebinds the module name 'refine' to the client instance; the
# name is kept as in the original in case later code relies on it.
refine = refine.Refine(server)
project = refine.new_project(project_file=SAMPLED,
                             project_format=_format,
                             project_options=_options)
print("Done")
print("Open: " + project.project_url())