# Overview of the statements on the following line (three logical parts).
#
# 1. Tail of a routine whose header is not visible here. It walks ``pos``
#    and builds the list ``iob``:
#      - an item that does NOT match the regex "<.*>" is appended as
#        "<item>/O", or as "<item>/I-T" when ``is_inside`` is set (the flag
#        is then cleared again);
#      - an item that matches the regex, equals ``tag`` and arrives while
#        ``iob`` is non-empty rewrites the last entry, replacing '/O' with
#        '/B-T', and sets ``is_inside``;
#      - any other item matching the regex is skipped.
#    The entries are returned joined by single spaces.
#    NOTE(review): str.replace() rewrites every '/O' occurrence in the last
#    entry, not only the trailing label - confirm this is intended for
#    words that themselves contain "/O".
#
# 2. is_self_closing_tag(self, tag): returns the result of
#    re.match('<[^<>]+/>', tag), i.e. a match object when ``tag`` starts
#    with that pattern and None otherwise (not a strict bool).
#
# 3. Script entry point: calls G_CONFIG.config_logging(), builds a Task from
#    sys.argv[1], runs PosTagTask(task) through mapPartitionsWithIndex on
#    task.get_rdd(), hands every resulting partition to Task.save_segments
#    via foreachPartition, then calls task.finalize().
#    (presumably a Spark RDD, per the inline comment - TODO confirm)
#
# NOTE(review): as laid out here, the code following each inline '#' sits on
# the same physical line as the comment marker - confirm the original line
# breaks before relying on this layout.
iob = [] is_inside = False for w in pos: if not re.search("<.*>", w): if is_inside: iob.append("{}/I-T".format(w)) is_inside = False else: iob.append("{}/O".format(w)) elif iob and w == tag: iob[-1] = iob[-1].replace('/O', '/B-T') is_inside = True else: # Other tags - skip them pass return " ".join(iob) def is_self_closing_tag(self, tag): return re.match('<[^<>]+/>', tag) if __name__ == "__main__": from Config.Config import G_CONFIG G_CONFIG.config_logging() task = Task(sys.argv[1]) # Launch RDD parallel processing task.get_rdd().mapPartitionsWithIndex(PosTagTask(task)).foreachPartition( Task.save_segments) task.finalize()
import os

from flask_principal import Principal
from flask_jwt import JWT
from celery import Celery
from datetime import timedelta
from Config.Config import G_CONFIG

# Flask application object shared by the rest of the module.
app = Flask(__name__)

# Secrets and service endpoints can be overridden from the environment so
# they do not have to live in source control. The historical literals are
# kept as fallbacks, so behaviour is unchanged when the variables are unset
# (or empty). The fallback secret is NOT suitable for production use.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY') or 'super-secret'
app.config['VERSION'] = 1
app.config['PROPAGATE_EXCEPTIONS'] = True

# Setup logging
handler = G_CONFIG.config_logging()
if handler:
    # Attach the handler returned by the project logging configuration
    # (only when one was actually returned).
    app.logger.addHandler(handler)

# Additionally log through a stream handler at DEBUG level.
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG)
app.logger.addHandler(stream_handler)

# fix gives access to the gunicorn error log facility
app.logger.handlers.extend(logging.getLogger("gunicorn.error").handlers)

principals = Principal(app)

# Celery configuration (environment overrides, same defaults as before)
app.config['CELERY_BROKER_URL'] = (
    os.environ.get('CELERY_BROKER_URL') or 'redis://localhost:6379/0'
)
app.config['CELERY_RESULT_BACKEND'] = (
    os.environ.get('CELERY_RESULT_BACKEND') or 'redis://localhost:6379/0'
)

# Initialize Celery
celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'])