def __init__(self, inputFilename, delimiter, sort_col, logname):
    """Prepare to sort a large CSV file on the value in one column.

    Args:
        inputFilename: full pathname of a CSV file containing records
            uniquely identified by the first field in a record.
        delimiter: separator between fields of a record
        sort_col: column index (or name) of the field to sort on
        logname: the basename for a message logger

    Note:
        Original data file is composed of ~100 chunks of records, each
        chunk sorted on the gbif ID, in position 0 of each record.
    """
    self.messyfile = inputFilename
    self.delimiter = delimiter
    self.sort_col = sort_col
    try:
        self.sort_idx = int(sort_col)
    except (TypeError, ValueError):
        # sort_col is not a numeric index; resolved later (presumably
        # against the header) -- was a bare except, narrowed here
        self.sort_idx = None
    self.header = []
    tmp, _ = os.path.splitext(self.messyfile)
    self._basepath, self._dataname = os.path.split(tmp)
    # BUG FIX: `pth` was read below but never assigned (NameError at
    # runtime); all outputs belong next to the input file, so use the
    # directory derived from it.
    pth = self._basepath
    logfname = os.path.join(pth, '{}.log'.format(logname))
    self._log = get_logger(logname, logfname)
    self.pth = pth
    self.splitBase = os.path.join(pth, 'split_{}'.format(self._dataname))
    self.tidyfile = os.path.join(pth, 'tidy_{}.csv'.format(self._dataname))
    # Open file handles keyed by group/chunk, populated later
    self._files = {}
def intersect_csv_and_shapefiles(in_csv_filename, geodata1, geodata2,
                                 ancillary_path, out_csv_filename, from_gbif):
    """Intersect the records in the csv file with the two provided shapefiles.

    Args:
        in_csv_filename (str): Path to a CSV file of records.
        geodata1 (str): Path to the first shapefile to check for
            intersection.
        geodata2 (str): Path to the second shapefile to check for
            intersection.
        ancillary_path (str): Path to ancillary data used by the
            point-in-polygon step.
        out_csv_filename (str): Path for output CSV records.
        from_gbif (bool): True if the input records originate from GBIF
            (passed through to the intersection step).

    Note:
        The docstring previously documented parameter names that did not
        match the signature; corrected here.
    """
    pth, basefname = os.path.split(out_csv_filename)
    logbasename, _ = os.path.splitext(basefname)
    logfname = os.path.join(pth, '{}.log'.format(logbasename))
    logger = get_logger(logbasename, logfname)
    bf = BisonFiller(log=logger)
    # Pass 4 of CSV transform, final step, point-in-polygon intersection
    bf.update_point_in_polygons(
        geodata1, geodata2, ancillary_path, in_csv_filename,
        out_csv_filename, from_gbif=from_gbif)
    # Random short sleep staggers completion messages from parallel workers
    sleep(randint(0, 10))
    print(' - {}'.format(out_csv_filename))
def __init__(self, infname, indelimiter, group_col, logname):
    """Split a large CSV file into individual files grouped by one column.

    Args:
        infname: full pathname to a CSV file containing records to be
            grouped on the value in a field of the records
        indelimiter: separator between fields of a record
        group_col: the column name (for files with a header) or column
            index for the field to be used for grouping
        logname: the basename for a message logger

    Raises:
        Exception: if group_col is neither a valid integer index nor a
            field name present in the file header.
    """
    self.messyfile = infname
    self.indelimiter = indelimiter
    self.group_col = group_col
    self.header = self._get_header()
    self.group_idx = None
    # group_col may be a numeric index or a header field name; try the
    # index interpretation first (was two bare excepts, narrowed here)
    try:
        self.group_idx = int(group_col)
    except (TypeError, ValueError):
        try:
            self.group_idx = self.header.index(group_col)
        except ValueError:
            raise Exception('Field {} does not exist in header {}'.format(
                self.group_col, self.header))
    tmp, _ = os.path.splitext(self.messyfile)
    self._basepath, self._dataname = os.path.split(tmp)
    # BUG FIX: `pth` was read below but never assigned (NameError at
    # runtime); outputs live alongside the input file, so use the
    # directory derived from it.
    pth = self._basepath
    logfname = os.path.join(pth, '{}.log'.format(logname))
    self._log = get_logger(logname, logfname)
    self.pth = pth
    # Open file handles keyed by group value, populated during the split
    self._files = {}
def reset_logger(self, outdir, logname):
    """Replace this object's logger with a fresh one writing into outdir.

    Args:
        outdir: directory in which the new log file is created
        logname: basename for both the logger and its '.log' file
    """
    # Drop the old logger first, then attach the replacement.
    self._log = None
    log_path = os.path.join(outdir, '{}.log'.format(logname))
    self._log = get_logger(logname, log_path)