def main(): op = OptionParser() op.add_option("--start", "-s", default=0, type="int") op.add_option("--end", "-e", default=100, type="int") op.add_option("--interval", "-i", default=100, type="int") options, arguments = op.parse_args() START = options.start TOTAL = options.end LIMIT = options.interval conf = "big_rds.conf" cmd = 'python unique_identifier_cli.py -f {0} -u "{1}"' timeout = 120 # in seconds, more than 2minutes seems like an eternity with open(conf, "r") as f: config = js.loads(f.read()) # our connection engine = sqla.create_engine(config.get("connection"), pool_timeout=360) Session = sessionmaker() Session.configure(bind=engine) session = Session() @event.listens_for(engine, "engine_connect") def ping_connection(connection, branch): if branch: return try: connection.scalar(select([1])) except exc.DBAPIError as err: if err.connection_invalidated: connection.scalar(select([1])) else: raise for i in xrange(START, TOTAL, LIMIT): print "***** START INTERVAL: ", i # for any clean runs for response in session.query(Response).filter(Response.format == "xml").limit(LIMIT).offset(i).all(): # join_query = session.query(UniqueIdentifier.response_id) # for response in session.query(Response).filter( # and_(Response.format == 'xml', ~Response.id.in_(join_query)) # ).limit(LIMIT).offset(i).all(): print "\tready" response_id = response.id if response.identifiers: continue print "\tgo" cleaned_content = response.cleaned_content # put it in a tempfile to deal with # very long files and paper over the # encoding, escaping junk handle, name = tempfile.mkstemp(suffix=".xml") write(handle, cleaned_content) close(handle) tc = TimedCmd(cmd.format(name, response.source_url)) try: status, output, error = tc.run(timeout) except Exception as ex: print "******propagated failed extraction: ", response_id # traceback.print_exc() print continue finally: unlink(name) if error: print "******error from cli: ", response_id print error print continue commits = [] for i in output.split("\n"): if not i: continue ident = js.loads(i) identifier = UniqueIdentifier( response_id=response_id, tag=ident.get("tag"), extraction_type=ident.get("extraction_type"), match_type=ident.get("match_type"), original_text=ident.get("original_text"), potential_identifier=ident.get("potential_identifier"), ) commits.append(identifier) try: session.add_all(commits) session.commit() except Exception as ex: print "**********failed commit: ", response_id print ex print session.rollback() print "\tcommitted" session.close()
def main(): op = OptionParser() op.add_option('--files', '-f') options, arguments = op.parse_args() conf = 'big_rds.conf' cmd = 'python unique_identifier_cli.py -f {0} -u "{1}"' timeout = 120 # in seconds, more than 2minutes seems like an eternity with open(conf, 'r') as f: config = js.loads(f.read()) # our connection engine = sqla.create_engine(config.get('connection'), pool_timeout=360) Session = sessionmaker() Session.configure(bind=engine) session = Session() @event.listens_for(engine, "engine_connect") def ping_connection(connection, branch): if branch: return try: connection.scalar(select([1])) except exc.DBAPIError as err: if err.connection_invalidated: connection.scalar(select([1])) else: raise for f in options.files.split(','): with open(f, 'r') as g: data = [int(a.strip()) for a in g.readlines() if a] for d in data: response = session.query(Response).filter(Response.id == d).first() print '\tready' response_id = response.id if response.identifiers: continue print '\tgo' cleaned_content = response.cleaned_content # put it in a tempfile to deal with # very long files and paper over the # encoding, escaping junk handle, name = tempfile.mkstemp(suffix='.xml') write(handle, cleaned_content) close(handle) tc = TimedCmd(cmd.format(name, response.source_url)) try: status, output, error = tc.run(timeout) except Exception as ex: print '******propagated failed extraction: ', response_id # traceback.print_exc() print continue finally: unlink(name) if error: print '******error from cli: ', response_id print error print continue commits = [] for i in output.split('\n'): if not i: continue ident = js.loads(i) identifier = UniqueIdentifier( response_id=response_id, tag=ident.get('tag'), extraction_type=ident.get('extraction_type'), match_type=ident.get('match_type'), original_text=ident.get('original_text'), potential_identifier=ident.get('potential_identifier') ) commits.append(identifier) try: session.add_all(commits) session.commit() except Exception as ex: print '**********failed commit: ', response_id print ex print session.rollback() print '\tcommitted' session.close()