def _openForReadWrite(self, fname, header=None):
    '''
    @summary: Read and populate lookup table if file exists, open and return
              file and csvwriter for writing or appending. If lookup file is
              new, write header if provided.
    '''
    # Accumulates key -> remaining-columns mapping read from an existing file.
    lookupDict = {}
    doAppend = False
    if os.path.exists(fname):
        doAppend = True
        try:
            csvRdr, infile = getCSVReader(fname, DELIMITER)
            # get header
            line, recno = self.getLine(csvRdr, 0)
            # read lookup vals into dictionary
            while (line is not None):
                line, recno = self.getLine(csvRdr, recno)
                if line and len(line) > 0:
                    try:
                        # First item is dict key, rest are vals
                        lookupDict[line[0]] = line[1:]
                    except Exception, e:
                        self._log.warn(
                            'Failed to read line {} from {}'.format(
                                recno, fname))
            self._log.info('Read lookup file {}'.format(fname))
        finally:
            # NOTE(review): source is truncated here — the `finally` body is
            # missing, along with the close of `infile`, the open of the
            # writer (presumably getCSVWriter with doAppend), the optional
            # header write promised by the docstring, and the return of the
            # writer/file/lookupDict. Recover the tail from version control
            # before editing; do not guess it back in.
def concatenateLookups(filepath, outfname, pattern=None, fnames=None):
    '''
    @summary: Concatenate named files or files matching pattern into a
              single file.
    @param filepath: Pathname to input files
    @param outfname: Basename of output file
    @param pattern: Pattern to match for input files
    @param fnames: Basename of one or more input files
    @note: Files matched by pattern AND files named in fnames are both
           included when both arguments are given.
    '''
    outfname = os.path.join(filepath, outfname)
    infnames = []
    outf = None
    try:
        csvwriter, outf = getCSVWriter(outfname, DELIMITER)
        if pattern is not None:
            infnames = glob.glob(os.path.join(filepath, pattern))
        if fnames is not None:
            for fn in fnames:
                infnames.append(os.path.join(filepath, fn))
        for fname in infnames:
            csvreader, inf = getCSVReader(fname, DELIMITER)
            # Reader is set to None on EOF to terminate the loop.
            while csvreader is not None:
                try:
                    line = next(csvreader)
                except OverflowError as e:
                    # Bug fix: csv readers expose line_num, not .line
                    print('Overflow on line {} ({})'.format(
                        csvreader.line_num, str(e)))
                except StopIteration:
                    print('EOF after line {}'.format(csvreader.line_num))
                    csvreader = None
                else:
                    # Bug fix: the record was read but never written, so the
                    # output file stayed empty.
                    csvwriter.writerow(line)
            inf.close()
    except Exception as e:
        print('Bad record on line {} ({})'.format(csvreader.line_num, e))
    finally:
        # Bug fix: the output file handle was never closed.
        if outf is not None:
            outf.close()
def _writeGBIFParserInput(self, inNameIdFname, outScinameFname):
    '''
    @summary: Read scientificName, taxonKey(s) from input CSV file, write
              a JSON file with scientificName list for input to the GBIF
              parser.
    @param inNameIdFname: Input CSV file with scientificName, and one or
           more taxonKeys (identifier for GBIF taxonomic record).
    @param outScinameFname: Output JSON file with list of ScientificNames
           for parsing by the GBIF parser.
    '''
    # Bug fix: pre-initialize so the finally block cannot raise NameError
    # (masking the original exception) when open()/getCSVReader fail.
    scif = None
    inf = None
    try:
        scif = open(outScinameFname, 'wb')
        scif.write('[{}'.format(NEWLINE))
        csvreader, inf = getCSVReader(inNameIdFname, DELIMITER)
        # discard header
        _, csvreader = self._readData(csvreader)
        # then get/write first line (no leading comma for the first element)
        encSciname, csvreader = self._readData(csvreader)
        self._writeData(scif, encSciname)
        # _readData returns csvreader=None at EOF, terminating the loop.
        while csvreader is not None:
            encSciname, csvreader = self._readData(csvreader)
            if encSciname is not None:
                scif.write(',{}'.format(NEWLINE))
                self._writeData(scif, encSciname)
        scif.write('{}]{}'.format(NEWLINE, NEWLINE))
    finally:
        if scif is not None:
            scif.close()
        if inf is not None:
            inf.close()
def splitFile(bigFname, limit=50000):
    # Split bigFname into chunk files of at most `limit` lines each.
    # NOTE(review): as shown, records are read but never written to
    # csvwriter, currFnum is never advanced past 1, stopLine is never
    # increased, and outf is never closed — the writer-rotation/writerow
    # logic appears to be missing (source looks truncated). Confirm the
    # full body against version control before modifying.
    currFnum = 1
    stopLine = limit
    csvreader, inf = getCSVReader(bigFname, DELIMITER)
    csvwriter, outf = _getNextWriter(bigFname, currFnum)
    # Reader is set to None on EOF to terminate the loop.
    while csvreader is not None and csvreader.line_num < stopLine:
        try:
            line = csvreader.next()
        except OverflowError, e:
            # NOTE(review): csv readers have no .line attribute — this
            # handler would raise AttributeError; presumably line_num was
            # intended (verify).
            print(
                'Overflow on line {} ({})'.format(csvreader.line, str(e)))
        except StopIteration:
            print('EOF after line {}'.format(csvreader.line_num))
            csvreader = None
    inf.close()
def openInputOutput(self):
    '''
    @summary: Load GBIF field metadata, open the interpreted-data CSV for
              reading and a fresh output CSV for writing, then emit the
              output header row.
    '''
    self.fldMeta = self.getFieldMeta()

    reader, infile = getCSVReader(self.interpFname, DELIMITER)
    self._iCsvrdr = reader
    self._if = infile

    writer, outfile = getCSVWriter(self.outFname, DELIMITER, doAppend=False)
    self._outWriter = writer
    self._outf = outfile

    # Fresh output file starts with its column header.
    writer.writerow(ORDERED_OUT_FIELDS)
    self._log.info('Opened input/output files')
def _openReader(self, fname): if not os.path.exists(fname): raise Exception('Input file {} does not exist'.format(fname)) reader, outf = getCSVReader(fname, self.delimiter) self._openfiles[fname] = outf return reader
def _getHeader(self):
    '''
    Return the first row of self.messyfile (the column header),
    closing the file handle before returning.
    '''
    csvRdr, handle = getCSVReader(self.messyfile, self.delimiter)
    firstRow = next(csvRdr)
    handle.close()
    return firstRow