def getParser(self, fileHash, folder=None, original=False):
    """Return a YACParser for *fileHash*, materialising a cleaned copy on demand.

    :param fileHash: md5 hash used as the stored filename
    :param folder: optional explicit folder to load the file from
    :param original: when True, bypass any stored cleaned version
    :return: a YACParser instance, or None when no stored file matches
    """
    if folder:
        candidate = os.path.join(folder, fileHash)
        if not os.path.exists(candidate):
            return None
        # Files in a cleaned folder were already encoding-normalised on storage.
        if folder in self.cleaned_folder.values():
            return YACParser(filename=candidate, skip_guess_encoding=True)
        return YACParser(filename=candidate)

    if not original:
        # Prefer an already-cleaned copy from any of the cleaned folders.
        for key in self.cleaned_folder:
            cleaned = os.path.join(self.cleaned_folder[key], fileHash)
            if os.path.exists(cleaned):
                return YACParser(filename=cleaned, skip_guess_encoding=True)

    # Fall back to the originally submitted file, caching a cleaned copy
    # so the branch above can serve it next time.
    for key in self.submit_folder:
        submitted = os.path.join(self.submit_folder[key], fileHash)
        cleaned = os.path.join(self.cleaned_folder[key], fileHash)
        if os.path.exists(submitted):
            parser = YACParser(filename=submitted)
            if not os.path.exists(cleaned):
                storeContent(parser.generate(), cleaned, md5=fileHash)
            return parser
    return None
def test_string_column_labelling(self):
    """Smoke-test GeoTagger.string_column over every column of a sample CSV."""
    geo_tagger = GeoTagger('localhost', 27017)
    parser = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
    first_table = parseDataTables(parser)[0]
    for column in first_table.columnIter():
        geo_tagger.string_column(column)
def submit(self, file=None, url=None, content=None, toFolder=None):
    """Store a submitted payload and ensure a cleaned version exists.

    Exactly one of *file*, *url* or *content* is consumed (checked in that
    order). The original payload is stored under its md5 hash inside the
    submit folder named *toFolder*; a cleaned version is generated into the
    matching cleaned folder the first time the hash is seen.

    :param file: local file to store
    :param url: URL to retrieve and store (also kept as symlink)
    :param content: raw content to store
    :param toFolder: key into self.submit_folder / self.cleaned_folder
    :return: md5 of the original payload, or None when *toFolder* is
        unknown or no payload was supplied
    """
    try:
        submit_dir = self.submit_folder[toFolder]
        cleaned_dir = self.cleaned_folder[toFolder]
    except KeyError:
        return None

    if file:
        md5 = storeFile(file, submit_dir)
    elif url:
        md5 = storeURL(url, submit_dir, max_file_size=self.max_file_size)
    elif content:
        md5 = storeContent(content, submit_dir)
    else:
        return None

    cleaned_path = os.path.join(cleaned_dir, md5)
    if not os.path.exists(cleaned_path):
        # First submission of this payload: generate and persist the
        # cleaned version next to the original.
        # NOTE(review): getParser calls storeContent with the full cleaned
        # *path* while this passes the cleaned *folder* -- confirm which
        # storeContent signature is intended; the two call sites disagree.
        table = YACParser(filename=os.path.join(submit_dir, md5))
        storeContent(table.generate(), cleaned_dir, md5=md5)
    return md5
def get_values(filename, col_id):
    """Extract the header label and numeric values of one column.

    Iterates every table parsed from *filename*; for each table it reads the
    header cell at *col_id* (the last table's header wins) and collects the
    column's cells as floats, abandoning the rest of a table's column on the
    first non-numeric cell.

    :param filename: source handed to YACParser.from_source
    :param col_id: zero-based column index
    :return: (header, values) -- header is the column label or 'MISSING',
        values is the list of floats collected across all tables
    """
    tables = YACParser.from_source(filename=filename)
    values = []
    errors = 0
    # Default up front so an empty table list no longer raises NameError.
    header = 'MISSING'
    for t in tables:
        if len(t.header_rows) > 0 and len(t.header_rows[0].cells) > col_id:
            header = t.header_rows[0].cells[col_id].value
        else:
            header = 'MISSING'
        for c in t.columns[col_id].cells:
            try:
                values.append(float(c.value))
            except (TypeError, ValueError):
                # Non-numeric cell: count it and skip the rest of this
                # table's column. Narrowed from a bare except, which also
                # swallowed KeyboardInterrupt/SystemExit.
                errors += 1
                break
    if errors > 0:
        # Parenthesized %-format prints identically under Python 2 and 3.
        print('NUMBER OF ERRORS: %d' % errors)
    return header, values
def test_csv(self):
    """Label columns of a sample CSV against admin regions fetched from MongoDB."""
    client = MongoClient('localhost', 27017)
    tagger = OSMTagger(client)
    db = client.geostore

    districts = db.geonames.find({
        'admin_level': 6,
        'parent': "http://sws.geonames.org/2769848/",
        "country": "http://sws.geonames.org/2782113/",
    })
    district_ids = [get_geonames_id(d['_id']) for d in districts]

    # Collect each district followed by its level-8 subdivisions.
    regions = []
    for district_id in district_ids:
        regions.append(district_id)
        subdivisions = db.geonames.find({
            'admin_level': 8,
            'parent': district_id,
            "country": "http://sws.geonames.org/2782113/",
        })
        for sub in subdivisions:
            regions.append(get_geonames_id(sub['_id']))

    parser = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
    first_table = parseDataTables(parser)[0]
    for column in first_table.columnIter():
        tagger.label_values(column, regions)
def csvclean_service(url):
    """Fetch *url* and return the parsed table object from the YACParser."""
    return YACParser(url=url, sample_size=100)