def get_headers(fn):
    """Return the header fields of CSV file *fn*.

    Reads only the first line of the file and parses it with the
    project's Csvline parser.
    """
    with open(fn, "r") as handle:
        header_line = handle.readline()
    # File is closed before parsing; parse needs only the text.
    return Csvline().parse(header_line)
"dwc:genus", "dwc:verbatimLocality", "dwc:catalogNumber", "dwc:eventDate", "dwc:recordedBy"] #fields = ["dwc:occurrenceID"] #fields = ["dwc:waterBody"] #fields = headers out_dir = "out{0}_{1}".format(raw, recordset) if not os.path.exists(out_dir): os.makedirs(out_dir) sc = SparkContext(appName="UniqueCSVline") csvline = Csvline() # filter removes header line which is going to be unique records = sc.textFile(fn) first_line = records.take(1)[0] records = records.filter(lambda line: line != first_line) parsed = records.map(lambda x: csvline.parse(x.encode("utf8"), headers) ) parsed.cache() # most fields have ":", some are URLs too in the raw data, make them usable # as a file name. p = re.compile('[\W_]+') for field in fields: out_fn = "{0}/unique_{1}.csv".format(out_dir, p.sub("_", field)) if os.path.exists(out_fn):