예제 #1
0
    def getParser(self, fileHash, folder=None, original=False):
        """
        Return a YACParser for the stored file named ``fileHash``.

        Lookup order:

        1) If ``folder`` is given, only ``folder/fileHash`` is considered.
           When ``folder`` is one of the configured cleaned folders the
           parser is created with ``skip_guess_encoding=True``.
        2) Otherwise, unless ``original`` is set, every cleaned folder is
           searched and the first hit is parsed with
           ``skip_guess_encoding=True``.
        3) Finally the submit folders are searched. On a hit the submitted
           file is parsed and, if no cleaned copy exists yet in the cleaned
           folder registered under the same key, one is generated and stored.

        :param fileHash: file name (hash) to look up
        :param folder: optional explicit folder to look in
        :param original: if true, skip the cleaned folders and parse the
            submitted file
        :return: a YACParser, or None if the file was not found
        """
        if folder:
            file_path = os.path.join(folder, fileHash)
            if not os.path.exists(file_path):
                return None
            if folder in self.cleaned_folder.values():
                return YACParser(filename=file_path, skip_guess_encoding=True)
            return YACParser(filename=file_path)

        if not original:
            for key in self.cleaned_folder:
                cleaned_path = os.path.join(self.cleaned_folder[key], fileHash)
                if os.path.exists(cleaned_path):
                    return YACParser(filename=cleaned_path, skip_guess_encoding=True)

        for key in self.submit_folder:
            submit_path = os.path.join(self.submit_folder[key], fileHash)
            if not os.path.exists(submit_path):
                continue
            table = YACParser(filename=submit_path)
            # A submit folder without a cleaned counterpart must not break
            # the lookup; in that case the cleaned copy is simply not stored.
            if key in self.cleaned_folder:
                cleaned_dir = self.cleaned_folder[key]
                if not os.path.exists(os.path.join(cleaned_dir, fileHash)):
                    cleaned = table.generate()
                    # Pass the folder (not the full file path), matching the
                    # storeContent(content, folder, md5=...) calls in submit().
                    storeContent(cleaned, cleaned_dir, md5=fileHash)
            return table
        return None
예제 #2
0
    def test_string_column_labelling(self):
        """Run GeoTagger.string_column over every column of the first table
        parsed from the AdressenJHB test file (smoke test, no assertions)."""
        tagger = GeoTagger('localhost', 27017)

        parser = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
        first_table = parseDataTables(parser)[0]
        for column in first_table.columnIter():
            tagger.string_column(column)
예제 #3
0
    def submit(self, file=None, url=None, content=None, toFolder=None):
        """
        Store a submitted file and make sure a cleaned copy of it exists.

        The input is taken from the first truthy argument among ``file``,
        ``url`` and ``content`` and stored in the submit folder registered
        under ``toFolder``. If the cleaned folder registered under the same
        key has no file with the resulting name, the submitted file is parsed
        with YACParser and its generated output is stored there.

        :param file: file to store (passed to storeFile)
        :param url: URL to store (passed to storeURL with self.max_file_size)
        :param content: raw content to store (passed to storeContent)
        :param toFolder: key into self.submit_folder / self.cleaned_folder
        :return: the md5 returned by the store helper, or None when
            ``toFolder`` is not registered in both folder maps or no input
            was given
        """
        registered = toFolder in self.submit_folder and toFolder in self.cleaned_folder
        if not registered:
            return None
        raw_dir = self.submit_folder[toFolder]
        clean_dir = self.cleaned_folder[toFolder]

        if file:
            digest = storeFile(file, raw_dir)
        elif url:
            digest = storeURL(url, raw_dir, max_file_size=self.max_file_size)
        elif content:
            digest = storeContent(content, raw_dir)
        else:
            return None

        raw_path = os.path.join(raw_dir, digest)
        clean_path = os.path.join(clean_dir, digest)
        # Only generate the cleaned version when none has been stored yet.
        if not os.path.exists(clean_path):
            parser = YACParser(filename=raw_path)
            storeContent(parser.generate(), clean_dir, md5=digest)
        return digest
예제 #4
0
def get_values(filename, col_id):
    """
    Read column ``col_id`` of the first table in ``filename`` as floats.

    Only the first table yielded by ``YACParser.from_source`` is inspected.
    Cells whose value cannot be converted with ``float()`` are skipped and
    counted; the count is printed if it is non-zero.

    :param filename: source passed to ``YACParser.from_source``
    :param col_id: zero-based index of the column to extract
    :return: tuple ``(header, values)``; ``header`` is ``'MISSING'`` when the
        table has no header cell for that column or when there is no table
        at all, in which case ``values`` is empty
    :raises IndexError: if the first table has no column at ``col_id``
        (assuming ``t.columns`` is a list-like sequence)
    """
    tables = YACParser.from_source(filename=filename)

    # Defaults cover the "no tables" case, which previously left ``header``
    # unbound and raised UnboundLocalError at the return statement.
    header = 'MISSING'
    values = []
    errors = 0

    first = next(iter(tables), None)
    if first is not None:
        if len(first.header_rows) > 0 and len(first.header_rows[0].cells) > col_id:
            header = first.header_rows[0].cells[col_id].value
        for cell in first.columns[col_id].cells:
            try:
                values.append(float(cell.value))
            except (TypeError, ValueError):
                # Non-numeric or missing cell: count it and carry on.
                errors += 1

    if errors > 0:
        # Single-argument form prints the same text on Python 2 and 3.
        print('NUMBER OF ERRORS: %s' % errors)
    return header, values
예제 #5
0
    def test_csv(self):
        """Collect geonames region ids from the local MongoDB geostore and run
        OSMTagger.label_values over every column of the first table parsed
        from the AdressenJHB test file (smoke test, no assertions)."""
        client = MongoClient('localhost', 27017)
        tagger = OSMTagger(client)

        db = client.geostore
        country = "http://sws.geonames.org/2782113/"
        level6 = db.geonames.find({'admin_level': 6,
                                   'parent': "http://sws.geonames.org/2769848/",
                                   "country": country})

        # Materialise the level-6 ids first, then add each one followed by
        # the ids of its level-8 children.
        level6_ids = [get_geonames_id(entry['_id']) for entry in level6]
        regions = []
        for parent_id in level6_ids:
            regions.append(parent_id)
            level8 = db.geonames.find({'admin_level': 8, 'parent': parent_id,
                                       "country": country})
            regions.extend(get_geonames_id(child['_id']) for child in level8)

        parser = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
        first_table = parseDataTables(parser)[0]
        for column in first_table.columnIter():
            tagger.label_values(column, regions)
예제 #6
0
def csvclean_service(url):
    """
    Build a YACParser for the resource at ``url`` using a sample size of 100.

    :param url: location handed to YACParser as its ``url`` argument
    :return: the constructed YACParser object
    """
    return YACParser(url=url, sample_size=100)