def extract_affiliations(project_id, modified_only=False):
    """Format the raw output from Refine into an ADS-readable file.

    Yields one tab-separated line per (bibcode, position) pair:
    ``bibcode<TAB>position<TAB>new_affiliation``.

    :param project_id: id of the Refine project to export.
    :param modified_only: when True, skip rows whose affiliation is
        unchanged after refinement.
    :raises Exception: if the project columns are not the expected ones.
    """
    p = refine.RefineProject(SERVER, project_id=project_id)
    # Sanity check: the Refine project must expose exactly these columns.
    expected = ['Original affiliation', 'New affiliation',
                'Original emails', 'New emails',
                'Bibcodes and positions']
    if p.columns != expected:
        raise Exception('ERROR: Columns are not as expected.')
    rows = p.export(export_format='tsv')
    # Skip the first row that contains the column names.  Use the
    # builtin next() instead of the Python-2-only .next() method.
    next(rows)
    for row in rows:
        # Strip the trailing newline, decode, and map unicode to entities.
        row = UNICODE_HANDLER.u2ent(row[:-1].decode('utf_8'))
        (original_aff, new_aff, original_emails,
         new_emails, bibcodes) = row.split('\t')
        original = rebuild_affiliation(original_aff, original_emails)
        new = rebuild_affiliation(new_aff, new_emails)
        if modified_only and original == new:
            continue
        # Each bibcode entry is "bibcode,position"; split on the first comma.
        for bibcode in bibcodes.split(' '):
            bibcode, position = bibcode.split(',', 1)
            yield '%s\t%s\t%s' % (bibcode, position, new)
def test_open_export(self):
    """Export a project as TSV and check the header and data rows."""
    fp = refine.RefineProject(self.project.project_url()).export()
    # Builtin next() works on both Python 2 and 3 iterators, unlike
    # the Python-2-only .next() method used previously.
    line = next(fp)
    self.assertTrue('email' in line)
    for line in fp:
        self.assertTrue('M' in line or 'F' in line)
    fp.close()
def export_csv_to_rdf(proj_name, input_file, encoding, json_file, output_file):
    """Exports CSV to RDF via OpenRefine using a JSON file.

    Creates an OpenRefine project from *input_file*, applies the
    operation history stored in *json_file*, and exports the result
    to *output_file*.

    :raises Exception: if the project could not be created.
    """
    server = refine.RefineServer()
    LOGGER.debug("Connected to OpenRefine")
    options_json = get_options(proj_name, input_file)
    # Build the new-style options payload directly instead of merging
    # it into an empty dict.
    params = {
        'options': json.dumps({'encoding': encoding}),
    }
    resp = server.urlopen('create-project-from-upload', options_json, params)
    # The new project id comes back as a query parameter of the
    # redirect URL.
    url_params = urlparse.parse_qs(urlparse.urlparse(resp.geturl()).query)
    if 'project' not in url_params:
        raise Exception('Project not created')
    project_id = url_params['project'][0]
    LOGGER.debug("Created project with project id: %s", project_id)
    proj = refine.RefineProject(project_id)
    update_project_file(project_id)
    apply_operations(proj, json_file)
    export_project(proj, output_file)
    if output_file:
        LOGGER.debug("RDF exported to: %s", output_file)
def test_open_export_csv(self):
    """Export a project as TSV, parse with csv.reader, check the rows."""
    fp = refine.RefineProject(self.project.project_url()).export()
    csv_fp = csv.reader(fp, dialect='excel-tab')
    # Builtin next() replaces the Python-2-only .next() method.
    row = next(csv_fp)
    self.assertTrue(row[0] == 'email')
    for row in csv_fp:
        self.assertTrue(row[3] == 'F' or row[3] == 'M')
    fp.close()
def delete(project_id):
    """Delete project.

    :param project_id: id of the OpenRefine project to delete.
    :raises Exception: if the server did not confirm the deletion.
    """
    project = refine.RefineProject(project_id)
    response = project.delete()
    # PEP 8: compare against the True singleton with identity,
    # not equality.
    if response is not True:
        raise Exception('Failed to delete %s: %s' % (project_id, response))
    else:
        print('Project %s has been successfully deleted' % project_id)
def delete(project_id):
    """Delete project.

    :param project_id: id of the OpenRefine project to delete.
    :raises Exception: if the server did not confirm the deletion.
    """
    project = refine.RefineProject(project_id)
    response = project.delete()
    # PEP 8: compare against the True singleton with identity,
    # not equality.
    if response is not True:
        raise Exception('Failed to delete {}: {}'.format(project_id, response))
    else:
        print('Project {} has been successfully deleted'.format(project_id))
def apply(project_id, history_file):
    """Apply OpenRefine history from json file to project."""
    # The server answers 'ok' on success; anything else is an error.
    response = refine.RefineProject(project_id).apply_operations(history_file)
    if response == 'ok':
        print('File %s has been successfully applied to project %s'
              % (history_file, project_id))
    else:
        raise Exception('Failed to apply %s to %s: %s'
                        % (history_file, project_id, response))
def deleteOR(refineproj=None):
    """Delete the OpenRefine project associated with this object.

    :param refineproj: an already-open RefineProject to delete; when
        None, a project is opened from ``self.openrefine_projectnumber``.
    :returns: True on completion.
    """
    # NOTE(review): this function reads ``self`` but declares no
    # ``self`` parameter — as written it raises NameError unless it
    # lives inside a class body as a method; confirm against callers.
    OPENREFINE_SERVER = current_app.config.get("OPENREFINE_SERVER")
    if not refineproj:
        refiner = refine.RefineProject(server=OPENREFINE_SERVER,
                                       project_id=int(
                                           self.openrefine_projectnumber))
    else:
        refiner = refineproj
    refiner.delete()
    # Clear the stored project number and persist the change.
    self.openrefine_projectnumber = ""
    self.save()
    return True
def update_project_file(project_id):
    """Write on file the project id.

    If LAST_PROJECT_FILE already exists, the project id it contains is
    read and that OpenRefine project is deleted before the file is
    overwritten with the new id.
    """
    if os.path.exists(LAST_PROJECT_FILE):
        # Delete the previously recorded project before replacing the id.
        with open(LAST_PROJECT_FILE, 'r') as update_file:
            last_project_created = update_file.readline().rstrip()
        LOGGER.debug("Deleting project id: %s", last_project_created)
        refine.RefineProject(last_project_created).delete()
    # Context manager guarantees the file handle is closed even on
    # error (the original opened/closed the file by hand, duplicated
    # across both branches).
    with open(LAST_PROJECT_FILE, 'w') as update_file:
        update_file.write("%s\n" % project_id)
def __init__(self, source=None):
    """Attach to the OpenRefine server and open (or create) the
    project backing *source*."""
    OPENREFINE_SERVER = current_app.config.get("OPENREFINE_SERVER")
    self.refine_server = refine.Refine(server=OPENREFINE_SERVER)
    # The source object carries the OpenRefine project id (ORid);
    # when it is missing a brand-new project is created instead.
    # Check that source exists and is good: bad URLs and bad files
    # are not possible at this point.
    self.source = source
    if source.ORid:
        self.refineproj = refine.RefineProject(
            server=OPENREFINE_SERVER,
            project_id=str(int(self.source.ORid)))
    else:
        self.refineproj = self.createOR(source)
def info(project_id):
    """Show project metadata"""
    projects = refine.Refine(refine.RefineServer()).list_projects()
    # Guard clause: unknown id prints an error and bails out early.
    if project_id not in projects.keys():
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return
    print('{0:>20}: {1}'.format('id', project_id))
    project_url = ('http://' + refine.REFINE_HOST + ':' +
                   refine.REFINE_PORT + '/project?project=' + project_id)
    print('{0:>20}: {1}'.format('url', project_url))
    # Print every non-empty metadata field.
    for key, value in projects[project_id].items():
        if value:
            print(u'{0:>20}: {1}'.format(key, value))
    # Then list the column names, numbered from 001.
    models = refine.RefineProject(project_id).get_models()
    column_names = [col['name'] for col in models['columnModel']['columns']]
    for index, name in enumerate(column_names, start=1):
        print(u'{0:>20}: {1}'.format(u'column ' + str(index).zfill(3), name))
def export(project_id, encoding=None, output_file=None, export_format=None):
    """Dump a project to stdout or file."""
    project = refine.RefineProject(project_id)
    export_format = export_format or 'tsv'
    if output_file:
        # An explicit file extension overrides the requested format.
        extension = os.path.splitext(output_file)[1][1:]
        if extension:
            export_format = extension.lower()
        # Text formats are always written as UTF-8.
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        with open(output_file, 'wb') as handle:
            handle.write(project.export(export_format=export_format,
                                        encoding=encoding).read())
        print('Export to file %s complete' % output_file)
    else:
        # No file given: stream the export to stdout.
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        sys.stdout.write(
            project.export(export_format=export_format,
                           encoding=encoding).read())
def main():
    """Main."""
    options, args = PARSER.parse_args()
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port
    # Without --list exactly one project argument is expected.
    if not options.list and len(args) != 1:
        PARSER.print_usage()
    if options.list:
        list_projects()
    # No project argument: nothing further to do.
    if not args:
        return
    project = refine.RefineProject(args[0])
    if options.apply:
        response = project.apply_operations(options.apply)
        if response != 'ok':
            print('Failed to apply %s: %s' % (options.apply, response),
                  file=sys.stderr)
    if options.export:
        export_project(project, options)
    return project
def getToValue(computeCluster):
    """Return, for each cluster, the 'value' of its most frequent member.

    :param computeCluster: iterable of clusters, each a list of dicts
        with at least ``count`` and ``value`` keys.
    :returns: list with one chosen ``value`` per cluster.
    """
    # Pick the dict with the highest 'count' in each cluster, then
    # collect its 'value' (comprehension replaces the manual append
    # loop of the original).
    winners = [max(list_of_dicts, key=lambda d: d['count'])
               for list_of_dicts in computeCluster]
    return [chosen['value'] for chosen in winners]


# @begin RenameColumn @desc Rename column name to make original table more meaningful
# @in projectID
# @in oldColumnName
# @in newColumnName
# @out table1
refine.RefineProject(refine.RefineServer(), projectID).rename_column('notes', 'commands')
# @end RenameColumn
# @begin OperationsColSponsor @desc OpenRefine operations on column sponsor
# @in table1
# @in projectID
# @in columnName:"sponsor"
# @in expression:"value.trim()"
# @in expression:"value.toLowercase()"
# @in clusterer_type:"binning"
# @in function:"ngram-fingerprint"
# @in function:getToValue
# @in function:getFromValue
# @in params:"20"
# @out table1-Sponsor
def main():
    """Command line interface."""
    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port
    options, args = PARSER.parse_args()
    # Collect the command options (groups 2 and 3) that were actually set.
    commands_dict = {
        group2_arg.dest: getattr(options, group2_arg.dest)
        for group2_arg in group2.option_list
    }
    commands_dict.update({
        group3_arg.dest: getattr(options, group3_arg.dest)
        for group3_arg in group3.option_list
    })
    # 'is not None' instead of '!= None' (PEP 8).
    commands_dict = {k: v for k, v in commands_dict.items() if v is not None}
    if not commands_dict:
        PARSER.print_usage()
        return
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port
    # A non-numeric first argument is a project name; resolve it to an
    # id (call .isdigit() on the string instead of the unbound
    # str.isdigit(...)).
    if args and not args[0].isdigit():
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = []
        for project_id, project_info in projects:
            if args[0] == project_info['name']:
                idlist.append(str(project_id))
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        else:
            args[0] = idlist[0]
    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            # print() call with file= replaces the Python-2-only
            # 'print >> sys.stderr' statement, matching the rest of
            # the code base.
            print('Failed to apply %s: %s' % (options.apply, response),
                  file=sys.stderr)
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
def templating(project_id, template, encoding='UTF-8', output_file=None,
               mode=None, prefix='', rowSeparator='\n', suffix='',
               filterQuery=None, filterColumn=None, facets=None,
               splitToFiles=False, suffixById=None):
    """Dump a project to stdout or file with templating.

    :param project_id: id of the OpenRefine project to export.
    :param template: GREL templating expression applied to each row.
    :param mode: 'record-based' to group by records, anything else is
        row-based.
    :param filterQuery: regex applied as a text facet on
        *filterColumn* (defaults to the project's key column).
    :param facets: JSON string of an extra facet to add to the engine.
    :param splitToFiles: write one output file per record/row instead
        of a single export.
    :param suffixById: with splitToFiles, name each file after the
        value of the first column instead of a running number.
    """
    project = refine.RefineProject(project_id)
    # basic config
    templateconfig = {
        'prefix': prefix,
        'suffix': suffix,
        'template': template,
        'rowSeparator': rowSeparator,
        'encoding': encoding
    }
    # construct the engine config
    if mode == 'record-based':
        engine = {'facets': [], 'mode': 'record-based'}
    else:
        engine = {'facets': [], 'mode': 'row-based'}
    if facets:
        engine['facets'].append(json.loads(facets))
    if filterQuery:
        # Default the filter to the project's key column.
        if not filterColumn:
            filterColumn = project.get_models()['columnModel']['keyColumnName']
        textFilter = {
            'type': 'text',
            'name': filterColumn,
            'columnName': filterColumn,
            'mode': 'regex',
            'caseSensitive': False,
            'query': filterQuery
        }
        engine['facets'].append(textFilter)
    templateconfig.update({'engine': json.dumps(engine)})
    if not splitToFiles:
        # normal output
        if not output_file:
            sys.stdout.write(
                project.export_templating(**templateconfig).read())
        else:
            with open(output_file, 'wb') as f:
                f.write(project.export_templating(**templateconfig).read())
            print('Export to file %s complete' % output_file)
    else:
        # splitToFiles functionality: a marker string is injected into
        # the template so the single export can be cut into records.
        prefix = templateconfig['prefix']
        suffix = templateconfig['suffix']
        split = '===|||THISISTHEBEGINNINGOFANEWRECORD|||==='
        if not output_file:
            output_file = time.strftime('%Y%m%d')
        else:
            # NOTE(review): base/ext are only bound in this branch; if
            # output_file was defaulted above they are undefined when
            # used below — confirm whether both branches should set
            # them.
            base = os.path.splitext(output_file)[0]
            ext = os.path.splitext(output_file)[1][1:]
            if not ext:
                ext = 'txt'
        # generate config for subfeature suffixById: export only the
        # first-column values to use as per-file name suffixes.
        if suffixById:
            ids_template = ('{{forNonBlank(' +
                            'with(row.columnNames[0],cn,cells[cn].value),' +
                            'v,v,"")}}')
            ids_templateconfig = {
                'engine': json.dumps(engine),
                'template': ids_template,
                'rowSeparator': '\n',
                'encoding': encoding
            }
            ids = [
                line.rstrip('\n')
                for line in project.export_templating(**ids_templateconfig)
                if line.rstrip('\n')
            ]
        # generate common config
        if mode == 'record-based':
            # record-based: split-character into template
            # if key column is not blank (=record)
            template = ('{{forNonBlank(' +
                        'with(row.columnNames[0],cn,cells[cn].value),' +
                        'v,"' + split + '", "")}}' +
                        templateconfig['template'])
            templateconfig.update({
                'prefix': '',
                'suffix': '',
                'template': template,
                'rowSeparator': ''
            })
        else:
            # row-based: split-character into template
            template = split + templateconfig['template']
            templateconfig.update({
                'prefix': '',
                'suffix': '',
                'template': template,
                'rowSeparator': ''
            })
        # execute: one export, then cut at the marker string.
        records = project.export_templating(
            **templateconfig).read().split(split)
        del records[0]  # skip first blank entry
        if suffixById:
            # One file per record, named by the record's first-column id.
            for index, record in enumerate(records):
                output_file = base + '_' + ids[index] + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)
        else:
            # One file per record, numbered with zero-padded counters.
            zeros = len(str(len(records)))
            for index, record in enumerate(records):
                output_file = base + '_' + \
                    str(index + 1).zfill(zeros) + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)