Example #1
    def analyseRemote(self, parser, since=None, limit=None):
        if since: pass  # TODO: implement since
        resultCount = 0

        # apiIterator = self.ApiIterator(self.service, self.endpoint_plural)
        apiIterator = self.getIterator(self.endpoint_plural)
        progressCounter = None
        for page in apiIterator:
            if progressCounter is None:
                total_items = apiIterator.total_items
                if limit:
                    total_items = min(limit, total_items)
                progressCounter = ProgressCounter(total_items)
            progressCounter.maybePrintUpdate(resultCount)

            # if Registrar.DEBUG_API:
            #     Registrar.registerMessage('processing page: %s' % str(page))
            if self.endpoint_plural in page:
                for page_item in page.get(self.endpoint_plural):

                    parser.analyseWpApiObj(page_item)
                    resultCount += 1
                    # >= so that exactly `limit` items are processed
                    if limit and resultCount >= limit:
                        if Registrar.DEBUG_API:
                            Registrar.registerMessage('reached limit, exiting')
                        return
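
All of these snippets lean on a ProgressCounter helper whose definition is never shown. The real implementations live in the source projects; the sketch below is an assumption covering both calling styles seen on this page: maybePrintUpdate(n) with an absolute count (Examples #1, #11, #13 and #14) and update(n) plus end() with increments (the rest). The print_abs flag and the one-second refresh interval are guesses.

import sys
import time

class ProgressCounter(object):
    # minimal sketch of the assumed interface, not the original code

    def __init__(self, total, print_abs=False, interval=1.0):
        self.total = total
        self.print_abs = print_abs   # print absolute counts, not percentages
        self.interval = interval     # seconds between refreshes
        self.count = 0
        self.last_print = 0.0

    def maybePrintUpdate(self, count):
        # absolute position; prints at most once per interval
        self.count = count
        now = time.time()
        if now - self.last_print >= self.interval:
            self.last_print = now
            if self.print_abs:
                sys.stdout.write("\r%d/%d" % (self.count, self.total))
            else:
                pct = 100.0 * self.count / max(self.total, 1)
                sys.stdout.write("\r%.1f%%" % pct)
            sys.stdout.flush()

    def update(self, n=1):
        # incremental flavour used by the batch-insert snippets
        self.maybePrintUpdate(self.count + n)

    def end(self):
        # force a final line so the next print starts clean
        sys.stdout.write("\r%d/%d\n" % (self.count, self.total))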
Example #2
def geocode_all(db, data_folder="geocoder/data",
                terms_folder="geocoder/terms",
                lines_per_insert=1000):

    print("Loading table...")
    # The query below is not very memory efficient; consider the streaming
    # approach suggested here:
    # https://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
    non_geocoded = Execucao.query.filter(Execucao.searched.is_(False)).all()
    with Geocoder(data_folder, terms_folder) as geocoder:
        counter = ProgressCounter(len(non_geocoded), print_abs=True)
        to_be_inserted = 0
        for row in non_geocoded:
            cells = get_geolocable_cells(row)
            geoent = geocoder.geocode_list(cells)
            if geoent:
                lat, lon, reg = geoent.best_coords()
                if lat:
                    row.point = "POINT(%s %s)" % (lon, lat)
            row.searched = True
            to_be_inserted += 1
            if to_be_inserted == lines_per_insert:
                db.session.commit()
                to_be_inserted = 0
            counter.update()
        if to_be_inserted:
            db.session.commit()
        counter.end()
Example #3
def update_from_csv(db, csv):
    """Update table using values from CSV. Slower than 'insert_csv' but raises
    no error if primary key already exists (just updates values)."""
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    modified_counter = 0
    added_counter = 0

    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        date = datetime.datetime.strptime(new_row["data"]["datafinal"], "%Y-%m-%d")
        if row_model:
            modified = {}

            # Check if state was modified
            if row_model.state != new_row["state"].decode("utf8"):
                modified["state"] = (row_model.state, new_row["state"])
                row_model.state = new_row["state"]

            # Check if a field in data was modified
            for key, new_value in new_row["data"].items():
                old_value = row_model.data.get(key)

                # Avoid confusion caused by new_value not being unicode
                if isinstance(new_value, str):
                    new_value = new_value.decode("utf8")
                    new_row["data"][key] = new_value

                # Treat falsy-to-falsy changes (e.g. 0 to None) as unchanged
                if (old_value or new_value) and (old_value != new_value):
                    modified[key] = (old_value, new_value)

            # Avoid registering the row as modified if only 'datafinal' changed
            if len(modified) == 1 and "datafinal" in modified:
                modified = {}

            if modified:
                db.session.add(History(event="modified", code=code, date=date, data=modified))
                modified_counter += 1

            # Updates DB data even if only 'datafinal' changed
            row_model.data = new_row["data"]
        else:
            db.session.add(History(event="created", code=code, date=date, data=new_row))
            db.session.add(Execucao(**new_row))
            added_counter += 1
        counter.update()
    counter.end()
    db.session.commit()
    print("Added/Modified/Total: %s/%s/%s" % (added_counter, modified_counter, len(table)))
Example #4
def insert_all(db, csv_file='../data/contratos-2014.xls', lines_per_insert=100):
    print("Importing Contratos from: {}".format(csv_file))
    data = pd.read_excel(csv_file)  # despite the parameter name, this is an .xls file
    data = data.fillna(-1)

    to_insert = []
    total = len(data)
    inserted = 0
    counter = ProgressCounter(total)

    for row_i, row in data.iterrows():

        r = {}

        if len(to_insert) == lines_per_insert:
            inserted += len(to_insert)
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = int(row['Vigencia']) if not np.isnan(row['Vigencia']) else -1
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        r['licitacao'] = row['Licitacao\n']  # the source column name ends with a literal newline
        r['data_publicacao'] = parse_date(row['Data Publicacao'])

        to_insert.append(r)

    if len(to_insert) > 0:
        inserted += len(to_insert)
        insert_rows(db, to_insert)

    counter.end()

    print("Imported {} Contratos".format(inserted))
Example #5
def insert_csv(db, csv, lines_per_insert):
    table = pd.read_csv(csv)
    pks = create_pks(table)
    # db = get_db()
    counter = ProgressCounter(len(table))

    to_insert = []
    for row_i, row in table.iterrows():
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        to_insert.append(prepare_row(pks.iloc[row_i], row))

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
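
insert_rows is another imported helper; the batching in the loop above only makes sense if each call issues one bulk INSERT. A minimal sketch for the Execucao case, assuming SQLAlchemy (each module presumably defines a variant for its own table):

def insert_rows(db, rows):
    # one multi-row INSERT per call, which is why the callers batch
    # lines_per_insert rows at a time before flushing
    db.session.bulk_insert_mappings(Execucao, rows)
    db.session.commit()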
Example #6
def download_contratos_files(csv_file='../data/urls.csv', directory='../data/contratos'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    def download_and_save(url, directory):
        if not isinstance(url, basestring):
            return

        import os.path
        import urllib2

        filename = url.split('/')[-1]
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            response = urllib2.urlopen(url)  # avoid shadowing the `file` builtin
            content = response.read()
            with open(path, 'wb') as f:  # bytes in, bytes out
                f.write(content)

    print("Downloading Contratos files from: {}".format(csv_file))
    data = pd.read_csv(csv_file)

    total = len(data)
    counter = ProgressCounter(total)

    downloaded = 0
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = dict((executor.submit(download_and_save, d['file_txt'], directory), d['file_txt'])
                             for di, d in data.iterrows())

        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            counter.update(1)
            if future.exception() is not None:
                print('%r generated an exception: %s' % (url,
                                                         future.exception()))
            else:
                downloaded += 1  # only count successful downloads

    counter.end()

    print("Downloaded {} Contratos".format(downloaded))
Example #7
def update_from_csv(db, csv):
    '''Update table using values from CSV. Slower than 'insert_csv' but raises
    no error if primary key already exists (just updates values).'''
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        if row_model:
            for key, new_value in new_row.iteritems():
                setattr(row_model, key, new_value)
                # old_value = getattr(row_model, key)
                # if old_value != new_value:
                #     print(key, old_value, new_value)
                #     setattr(row_model, key, new_value)
        else:
            db.session.add(Execucao(**new_row))
        counter.update()
    counter.end()
    db.session.commit()
Example #8
def insert_all(db, csv_file='../data/urls.csv', lines_per_insert=100):
    print("Getting Contratos urls from: " + csv_file)
    data = pd.read_csv(csv_file)

    total = len(data)
    counter = ProgressCounter(total)

    to_update = 0
    updated = 0
    for di, d in data.iterrows():
        to_update += 1
        stmt = (update(Contrato)
                .values({'file_url': d['file_url'],
                         'txt_file_url': d['file_txt']})
                .where(Contrato.numero == str(d['numero'])))
        db.session.execute(stmt)
        if to_update == lines_per_insert or (updated + to_update) == total:
            counter.update(to_update)
            updated += to_update
            to_update = 0
            db.session.commit()

    counter.end()

    print("Updated {} Contratos".format(updated))
Example #9
def insert_all(db,
               csv_file='../data/contratos-2014.xls',
               lines_per_insert=100):
    print("Importing Contratos from: " + csv_file)
    data = pd.read_excel(csv_file)
    data = data.fillna(-1)

    to_insert = []
    counter = ProgressCounter(len(data))

    for row_i, row in data.iterrows():

        r = {}

        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = int(
            row['Vigencia']) if not np.isnan(row['Vigencia']) else -1
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        r['licitacao'] = row['Licitacao\n']
        r['data_publicacao'] = parse_date(row['Data Publicacao'])

        to_insert.append(r)

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
Example #10
def insert_csv(db, csv, lines_per_insert):
    print(csv)
    table = pd.read_csv(csv)
    counter = ProgressCounter(len(table))

    # ## Add code column ## #
    code_series = [
        col for name, col in table.iteritems() if name[:3].lower() == "cd_"
    ]
    # this column doesn't start with "cd_" but is a code
    code_series.append(table["projetoatividade"])
    # create table of codes
    code_table = pd.concat(code_series, axis=1)
    # create PK Series
    pks = pd.Series(['.'.join(str(value) for value in row[1])
                     for row in code_table.iterrows()],
                    name="code")
    # check pk uniqueness
    if pks.duplicated().values.sum() > 0:
        print("Warning: There are duplicated pks!")
    # add the pk series to the table
    # table = pd.concat([table, pks], axis=1)
    # ## --------------- ## #

    to_insert = []

    for row_i, row in table.iterrows():

        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        to_insert.append({"code": pks.iloc[row_i], "data": dict(row.iterkv())})

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
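
This example builds the dotted primary keys inline; the create_pks helper imported by the other snippets presumably wraps the same logic. A sketch reconstructed from the code above:

import pandas as pd

def create_pks(table):
    # gather every code column ("cd_*" plus "projetoatividade")
    code_series = [col for name, col in table.iteritems()
                   if name[:3].lower() == "cd_"]
    code_series.append(table["projetoatividade"])
    code_table = pd.concat(code_series, axis=1)
    # the dotted concatenation of the code values is the PK
    pks = pd.Series(['.'.join(str(value) for value in row[1])
                     for row in code_table.iterrows()],
                    name="code")
    if pks.duplicated().values.sum() > 0:
        print("Warning: There are duplicated pks!")
    return pks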
Example #11
        if Registrar.DEBUG_MESSAGE:
            print "email matches (%d)" % len(emailMatcher.matches)
        #     print emailMatcher.matches.tabulate(tablefmt="simple")


        # TODO: further sort emailMatcher

        print debugUtils.hashify("BEGINNING MERGE (%d)" % len(globalMatches))
        print timediff()


        syncCols = ColData_User.getSyncCols()

        if Registrar.DEBUG_PROGRESS:
            syncProgressCounter = ProgressCounter(len(globalMatches))

        for count, match in enumerate(globalMatches):
            if Registrar.DEBUG_PROGRESS:
                syncProgressCounter.maybePrintUpdate(count)

            mObject = match.mObjects[0]
            sObject = match.sObjects[0]

            syncUpdate = SyncUpdate_Usr(mObject, sObject)
            syncUpdate.update(syncCols)

            # SanitationUtils.safePrint( syncUpdate.tabulate(tablefmt = 'simple'))

            if syncUpdate.mUpdated and syncUpdate.mDeltas:
                insort(mDeltaUpdates, syncUpdate)
Example #12
def insert_all(db, csv_file='../data/receitas_min.csv', lines_per_insert=100):
    print("Importing Revenues from: " + csv_file)
    data = pd.read_csv(csv_file, encoding='utf8')

    cache = {}
    to_insert = []
    counter = ProgressCounter(len(data))

    for row_i, row in data.iterrows():

        r = {}

        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        r['original_code'] = row['codigo']
        r['description'] = row['descricao']
        r['date'] = parse_date(row['data'])
        r['monthly_outcome'] = parse_money(row['realizado_mensal'])
        r['monthly_predicted'] = parse_money(row['previsto_mensal'])
        code_parsed = parse_code(row['codigo'])
        r['economical_category'] = code_parsed[0]

        # Insert code reference
        code_parts = map(int, r['original_code'].split('.'))
        len_cp = len(code_parts)

        for i in range(len_cp):
            code = '.'.join(map(str, code_parts[:len_cp - i]))
            if code not in cache:
                code_result = db.session.query(
                    RevenueCode.id).filter(RevenueCode.code == code).all()
                if code_result:
                    cache[code] = code_result[0][0]
                    r['code_id'] = code_result[0][0]
                    break
            else:
                r['code_id'] = cache[code]
                break
        else:
            # for/else: runs only when no prefix matched (i.e. no break)
            r['code_id'] = None

        # code_parsed positions 1..5 map onto these optional fields; parse_code
        # appears to yield at most 6 components, so `> i` reproduces the
        # original chain of >= checks (including the final == 6)
        optional_fields = ['economical_subcategory', 'source', 'rubric',
                           'paragraph', 'subparagraph']
        for i, field in enumerate(optional_fields, start=1):
            r[field] = code_parsed[i] if len(code_parsed) > i else None

        to_insert.append(r)

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
Example #13
    def printFileProgress(self, completed, total):
        if not hasattr(self, 'progressCounter'):
            self.progressCounter = ProgressCounter(total)
        self.progressCounter.maybePrintUpdate(completed)
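
This method is shaped as a paramiko SFTP progress callback: get() and put() invoke their callback as callback(bytes_transferred, total_bytes), and the counter is created lazily on the first call. Example #14 below passes it to sftpClient.get() the same way; a standalone usage sketch (host, credentials and paths hypothetical, reusing the ProgressCounter sketch from Example #1):

import paramiko

def print_file_progress(completed, total):
    # same shape as the method above, but standalone for the example
    if not hasattr(print_file_progress, 'counter'):
        print_file_progress.counter = ProgressCounter(total)
    print_file_progress.counter.maybePrintUpdate(completed)

ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect('example.com', username='user')   # hypothetical credentials

sftp = ssh.open_sftp()
# paramiko invokes the callback as callback(bytes_transferred, total_bytes)
sftp.get('/remote/export.csv', 'export.csv', callback=print_file_progress)
sftp.close()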
Example #14
class UsrSyncClient_SSH_ACT(SyncClient_Abstract):
    def __init__(self, connectParams, dbParams, fsParams):
        self.dbParams = dbParams
        self.fsParams = fsParams
        super(UsrSyncClient_SSH_ACT, self).__init__(connectParams)

    def attemptConnect(self):
        self.service = paramiko.SSHClient()
        self.service.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.service.connect(**self.connectParams)

    @property
    def connectionReady(self):
        return self.service and self.service._transport and self.service._transport.active

    def execSilentCommandAssert(self, command):
        self.assertConnect()
        stdin, stdout, stderr = self.service.exec_command(command)
        if stdin: pass  # silences unused-variable warnings
        possible_errors = stdout.readlines() + stderr.readlines()
        for error in possible_errors:
            if re.match("^Countries.*", error):
                print error
                continue
            assert not error, "command <%s> returned errors: %s" % (
                SanitationUtils.coerceUnicode(command),
                SanitationUtils.coerceUnicode(error)
            )

    def putFile(self, localPath, remotePath):
        self.assertConnect()

        # remoteDir, remoteFileName = os.path.split(remotePath)
        remoteDir = os.path.split(remotePath)[0]

        sftpClient = self.service.open_sftp()
        if remoteDir:
            try:
                sftpClient.stat(remoteDir)
            except IOError:  # paramiko raises IOError when the path is missing
                sftpClient.mkdir(remoteDir)
        sftpClient.put(localPath, remotePath)
        fstat = sftpClient.stat(remotePath)
        sftpClient.close()

        if not fstat:
            raise UserWarning("could not stat remote file")

    def assertRemoteFileExists(self, remotePath, assertion=""):
        self.assertConnect()

        # stdin, stdout, stderr = self.service.exec_command('stat "%s"' % remotePath)
        stderr = self.service.exec_command('stat "%s"' % remotePath)[2]
        possible_errors = stderr.readlines()
        assert not possible_errors, " ".join([assertion, "stat returned possible errors", str(possible_errors)])

    @classmethod
    def printFileProgress(cls, completed, total):
        if not hasattr(cls, 'progressCounter'):
            cls.progressCounter = ProgressCounter(total)
        cls.progressCounter.maybePrintUpdate(completed)

    def getDeleteFile(self, remotePath, localPath):
        self.assertRemoteFileExists(remotePath)

        sftpClient = self.service.open_sftp()
        sftpClient.get(remotePath, localPath, self.printFileProgress)
        sftpClient.remove(remotePath)
        sftpClient.close()

    def removeRemoteFile(self, remotePath):
        self.assertRemoteFileExists(remotePath)
        self.service.exec_command('rm "%s"' % remotePath)

    def uploadChanges(self, user_pkey, updates=None):
        if not updates:
            return
        # print "UPDATES:", updates

        self.assertConnect()

        if 'MYOB Card ID' in updates:
            del updates['MYOB Card ID']

        updates = OrderedDict(
            [('MYOB Card ID', user_pkey)] + updates.items()
        )

        importName = self.fsParams['importName']
        outFolder = self.fsParams['outFolder']
        remote_export_folder = self.fsParams['remote_export_folder']
        fileRoot = 'act_i_' + importName + '_' + user_pkey
        fileName = fileRoot + '.csv'
        localPath = os.path.join(outFolder, fileName)
        remotePath = os.path.join(remote_export_folder, fileName)
        importedFile = os.path.join(remote_export_folder, fileRoot + '.imported')

        with open(localPath, 'w+') as outFile:
            csvdialect = UnicodeCsvDialectUtils.act_out
            dictwriter = unicodecsv.DictWriter(
                outFile,
                dialect=csvdialect,
                fieldnames=updates.keys(),
                encoding='utf8',
                extrasaction='ignore',
            )
            dictwriter.writeheader()
            dictwriter.writerow(updates)

        self.putFile(localPath, remotePath)

        tokens = [
            'cd ' + remote_export_folder + ';',
            '{db_i_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}"'.format(
                **self.dbParams
            ),
            ('"%s"' % fileName) if fileName else None
        ]

        command = " ".join( token for token in tokens if token)

        # command = " ".join(filter(None,))
        #
        # command = " ".join(filter(None,[
        #     'cd ' + remote_export_folder + ';',
        #     '{db_i_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}"'.format(
        #         **self.dbParams
        #     ),
        #     ('"%s"' % fileName) if fileName else None
        # ]))

        self.execSilentCommandAssert(command)

        try:
            self.removeRemoteFile(importedFile)
        except Exception:
            raise Exception("import didn't produce a .imported file")

    def analyseRemote(self, parser, since=None, limit=None):
        if not since:
            since = '1970-01-01'
        if limit:
            # todo: implement limit
            # this gets rid of unused argument warnings
            pass

        importName = self.fsParams['importName']
        remote_export_folder = self.fsParams['remote_export_folder']
        fileRoot = 'act_x_' + importName
        fileName = fileRoot + '.csv'
        inFolder = self.fsParams['inFolder']
        localPath = os.path.join(inFolder, fileName)
        remotePath = os.path.join(remote_export_folder, fileName)

        tokens = [
            'cd ' + remote_export_folder + ';',
            '{db_x_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}" -c"{fields}"'.format(
                **self.dbParams
            ),
            '-s"%s"' % since,
            '"%s"' % fileName
        ]

        command = " ".join([token for token in tokens if token])

        # command = " ".join(filter(None,[
        #     'cd ' + remote_export_folder + ';',
        #     '{db_x_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}" -c"{fields}"'.format(
        #         **self.dbParams
        #     ),
        #     '-s"%s"' % since,
        #     '"%s"' % fileName
        # ]))

        print "executing export command..."
        self.execSilentCommandAssert(command)
        print "donloading file..."
        self.getDeleteFile(remotePath, localPath)
        print "analysing file..."
        parser.analyseFile(localPath, dialect_suggestion='act_out')