def initialize(self):
    """Open the underlying CSV stream and prepare the output field list.

    Builds a ``CSVDataSource`` from ``self.resource`` (forwarding any extra
    positional/keyword arguments captured at construction), optionally
    overrides its fields with ``self.fields``, and initializes it.
    """
    self.stream = ds.CSVDataSource(self.resource, *self.args, **self.kwargs)
    if self.fields:
        self.stream.fields = self.fields
    self.stream.initialize()
    # FIXME: this is experimental form of usage — derive the output fields
    # by re-typing a copy of the source fields.
    self._output_fields = self.stream.fields.copy()
    self._output_fields.retype(self._retype_dictionary)
def transform(self):
    """Download the CSV document at ``self.url`` and return its rows.

    Opens the URL, wraps the response in a ``CSVDataSource`` using the
    configured ``encoding`` and ``dialect``, and reads all rows via
    ``self.read_source_rows``.

    Returns:
        Whatever ``read_source_rows`` produces for the fetched source.
    """
    handle = urllib2.urlopen(self.url)
    # Fix: close the HTTP handle even when CSV parsing raises — the
    # original only closed it on the success path, leaking the socket
    # on any exception from initialize()/read_source_rows().
    try:
        src = ds.CSVDataSource(handle, encoding=self.encoding,
                               dialect=self.dialect)
        src.initialize()
        return self.read_source_rows(src)
    finally:
        handle.close()
def transform(self):
    """Download the CSV/TSV document at ``self.url`` and return its rows.

    If no dialect was configured, it is inferred from the URL suffix:
    ``.tsv`` selects ``'excel-tab'``, anything else falls back to
    ``'excel'``. The response is then parsed with ``CSVDataSource`` and
    read via ``self.read_source_rows``.

    Returns:
        Whatever ``read_source_rows`` produces for the fetched source.
    """
    handle = urllib2.urlopen(self.url)
    # Fix: guarantee the HTTP handle is closed on error paths too — the
    # original leaked it if CSV initialization or row reading raised.
    try:
        if not self.dialect:
            # Infer the dialect from the file extension.
            if self.url.endswith('.tsv'):
                self.dialect = 'excel-tab'
            else:
                self.dialect = 'excel'
        src = ds.CSVDataSource(handle, encoding=self.encoding,
                               dialect=self.dialect)
        src.initialize()
        return self.read_source_rows(src)
    finally:
        handle.close()
# NOTE(review): this collapsed line is a CSV-merge script fragment. The leading
# `all_fields.append(field)` is the TAIL of a field-collection loop whose start
# is outside this view — do not treat it as a top-level statement. The rest of
# the line: creates a CSVDataTarget "merged.csv" with the collected fields,
# then for every source file opens a CSVDataSource (read_header=False,
# skip_rows=1 — headers are supplied explicitly), tags each record with its
# originating file path under the "file" key, appends it to the target, and
# finalizes each source. The target itself is NOT finalized in this view —
# presumably done later; verify.
all_fields.append(field) # Create and initialize a data target out = ds.CSVDataTarget("merged.csv") out.fields = brewery.FieldList(all_fields) out.initialize() # Append all sources for source in sources: path = source["file"] # Initialize data source: skip reading of headers - we are preparing them ourselves # use XLSDataSource for XLS files # We ignore the fields in the header, because we have set-up fields # previously. We need to skip the header row. src = ds.CSVDataSource(path, read_header=False, skip_rows=1) src.fields = ds.FieldList(source["fields"]) src.initialize() for record in src.records(): # Add file reference into ouput - to know where the row comes from record["file"] = path out.append(record) # Close the source stream src.finalize()
# gtedgePull(gtedgeInput, profs, consts)
# --------------------------------------
# Converts GTEDGE `pyinput*` text files found in directory `gtedgeInput` into
# per-file CSVs, then merges them into "<gtedgeInput>_const.csv" (two-line
# inputs = constants) and "<gtedgeInput>_profs.csv" (multi-line inputs =
# profiles) using the brewery CSV source/target streams. Finally reopens the
# reference `profs`/`consts` files relative to the parent directory.
#
# NOTE(review): Python-2 era code — csv files are opened in 'wb' mode and the
# merge layer uses brewery/urllib2-style APIs. Visible review issues, left
# untouched here:
#   * the constants dedup loop body is `pass` (dedup code commented out), so
#     `all_fields_Const` stays ["file"] and `breweryConstOut.fields` is taken
#     from the LAST `source["fields"]` of the preceding loop — looks like a
#     loop-variable leak; confirm intended.
#   * `breweryConstOut.close_file` / `breweryProfsOut.close_file` are bare
#     attribute accesses — missing `()` — so nothing is called.
#   * `in_file`, `f_profs`, `f_consts` are opened but never closed in this
#     view (the function may continue past this chunk; verify).
#   * the regex `'\,+' -> ','` collapses runs of commas in the temp profiles
#     CSV, i.e. it drops empty columns produced by ragged rows.
def gtedgePull(gtedgeInput,profs,consts): # Automatically generates .csv file of pyinput.txt files # to provide GTEDGE input values os.chdir(gtedgeInput) pyinputs=glob.glob('pyinput*') pyinputsConst=[] pyinputsProfs=[] sourcesConst=[] sourcesProfs=[] tempVarsConst=[] tempVarsProfs=[] varsConst=[] varsProfs=[] varNamesConst=[] varNamesProfs=[] constHeaders=[] profsHeaders=[] firstLine=True for filename in pyinputs: with open(filename) as openfile: num_lines=sum(1 for line in openfile) openfile.seek(0) if num_lines==2: for line in openfile: if firstLine==True: line=line.strip() varNamesConst.append(line.split(',')) constHeaders.append(line.split(',')) firstLine=False else: tempVarsConst.append(line.split()) varsConst.append(line.split()) sourcesConst.append({"file":filename+".csv","fields":varNamesConst[0]}) else: for line in openfile: if firstLine==True: line=line.strip() varNamesProfs.append(line.split(',')) profsHeaders.append(line.split(',')) firstLine=False else: tempVarsProfs.append(line.split()) varsProfs.append(line.split()) sourcesProfs.append({"file":filename+".csv","fields":varNamesProfs[0]}) firstLine=True with open(filename+'.csv','wb') as csvfile: if num_lines==2: csvWriter=csv.writer(csvfile,delimiter=',') csvWriter.writerow(varNamesConst[0]) for a in tempVarsConst: csvWriter.writerow(a) varNamesConst=[] tempVarsConst=[] else: csvWriter=csv.writer(csvfile,delimiter=',') csvWriter.writerow(varNamesProfs[0]) for a in tempVarsProfs: csvWriter.writerow(a) varNamesProfs=[] tempVarsProfs=[] all_fields_Const=brewery.FieldList(["file"]) for source in sourcesConst: for field in source["fields"]: pass # if field not in all_fields_Const.fields(): # all_fields_Const.append(field) breweryConstOut=ds.CSVDataTarget(gtedgeInput+"_const.csv",) breweryConstOut.fields=brewery.FieldList(source["fields"]) breweryConstOut.initialize() for source in sourcesConst: path=source["file"] src=ds.CSVDataSource(path,read_header=True,skip_rows=0) 
# (continuation of gtedgePull: merge constant sources, then profile sources,
# then squeeze repeated commas out of the temp profiles CSV.)
src.fields=ds.FieldList(source["fields"]) src.initialize() breweryConstOut.field_names=(source["fields"]) for record in src.records(): record["file"]=path breweryConstOut.append(record) src.finalize() breweryConstOut.finalize() breweryConstOut.close_file all_fields_Profs=brewery.FieldList(["file"]) for source in sourcesProfs: for field in source["fields"]: if field not in all_fields_Const: all_fields_Profs.append(field) breweryProfsOut=ds.CSVDataTarget(gtedgeInput+"_profs_temp.csv") breweryProfsOut.fields=brewery.FieldList(all_fields_Profs) breweryProfsOut.initialize() for source in sourcesProfs: path=source["file"] src=ds.CSVDataSource(path,read_header=False,skip_rows=1) src.fields=ds.FieldList(source["fields"]) src.initialize() for record in src.records(): record["file"]=path breweryProfsOut.append(record) src.finalize() breweryProfsOut.finalize() breweryProfsOut.close_file in_file=open(gtedgeInput+"_profs_temp.csv") out_file=open(gtedgeInput+"_profs.csv","wb+") for line in in_file: line=re.sub('\,+',',',line) out_file.write(line) out_file.close() # profsHeaders=sum(profsHeaders,[]) # all_fields=brewery.FieldList(profsHeaders) # breweryProfsOut=brewery.ds.CSVDataTarget(gtedgeInput+"_profs.csv") # breweryProfsOut.fields=all_fields # breweryProfsOut.initialize() # for source in pyinputsProfs: # src=brewery.ds.CSVDataSource(source,read_header=False,skip_rows=0) # src.fields=brewery.ds.FieldList(profsHeaders) # src.initialize() # for record in varsProfs: # breweryProfsOut.append(record) # src.finalize() # breweryProfsOut.finalize() # breweryProfsOut.close_file # newConst=open(gtedgeInput+"_consts.csv","wb+") # headers=[] # for file in pyinputsCons: # f=open(file) # headers.append(f.read().split(,)) os.chdir(os.pardir) file_path_profs=os.path.relpath(profs) file_path_const=os.path.relpath(consts) f_profs=open(file_path_profs,'r') f_consts=open(file_path_const,'r')
"""Re-encode a CSV file to UTF-8 on stdout, sniffing encoding and delimiter.

Usage: script.py FILENAME

Detects the input encoding with chardet, tries ',' as the delimiter and
falls back to ';' when the header parses as a single field, then streams
every record to a UTF-8 CSV target on stdout.
"""
import sys

import brewery.ds as ds
import brewery.dq as dq
from chardet.universaldetector import UniversalDetector

filename = sys.argv[1]

# Sniff the file's character encoding before parsing it as CSV.
detector = UniversalDetector()
# Fix: use open() instead of the Python-2-only file() builtin, and close
# the handle deterministically — the original leaked it.
with open(filename, 'rb') as raw:
    for line in raw:
        detector.feed(line)
        if detector.done:
            break
detector.close()

encoding = detector.result["encoding"]

src = ds.CSVDataSource(filename, read_header=True,
                       encoding=encoding, delimiter=',')
src.initialize()
# Heuristic: if the whole header collapsed into one field, the delimiter
# is probably ';' — retry with it.
if len(src.field_names) == 1:
    src.finalize()
    src = ds.CSVDataSource(filename, read_header=True,
                           encoding=encoding, delimiter=';')
    src.initialize()

out = ds.CSVDataTarget(sys.stdout, encoding='utf-8')
# NOTE(review): assumes `ds.fieldlist` exists as in the original — confirm
# against the installed brewery version (newer ones expose FieldList).
out.fields = ds.fieldlist(src.field_names)
out.initialize()

for record in src.records():
    out.append(record)

# Fix: finalize both streams, matching the finalize() convention used by
# the other brewery snippets in this file; the original never cleaned up.
src.finalize()
out.finalize()