# Shared imports for the download-service snippets below (inferred from usage;
# DBConnection, date_diff_in_s, message_publisher and settings come from
# project modules not shown in this listing).
import json
import mimetypes
import os
import uuid
from datetime import datetime

import pika
import requests
from flask import request, jsonify
from werkzeug.utils import secure_filename


# GET handler (method of a Flask-RESTful Resource; the enclosing class is not
# shown). Returns a job record plus an estimated remaining time in seconds.
def get(self):
    db_init = DBConnection()
    job_id = request.args.get('id', None)
    if not job_id:
        db_init.close()
        return jsonify({"message": "no job id supplied"})
    result = db_init.get_job({'job_id': job_id})
    # Timestamp format used by the stored job records (consumed by date_diff_in_s).
    date_time_format = '%Y-%m-%d %H:%M:%S'
    temp = {}
    for item in result:
        updated_time = item["updated_time"]
        start_time = item["start_time"]
        downloaded_size = item["downloaded_size"]
        total_file_size = item["total_file_size"]
        estimated_time = 0
        if downloaded_size and total_file_size:
            # Estimate: (remaining bytes / downloaded bytes) * elapsed seconds.
            diff_sec = date_diff_in_s(updated_time, start_time)
            estimated_time = ((float(total_file_size) - float(downloaded_size))
                              / float(downloaded_size)) * diff_sec
        temp = dict(item)
        temp["estimated_time_seconds"] = round(estimated_time, 2)
        break  # job_id is unique, so only the first record matters
    db_init.close()
    return jsonify(temp)
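# `date_diff_in_s` is referenced above but not defined in this listing. A
# minimal sketch, assuming the job records store timestamps as strings in the
# '%Y-%m-%d %H:%M:%S' format declared in the handler:
def date_diff_in_s(updated_time, start_time,
                   date_time_format='%Y-%m-%d %H:%M:%S'):
    """Return elapsed seconds between two stored timestamp strings."""
    updated = datetime.strptime(updated_time, date_time_format)
    started = datetime.strptime(start_time, date_time_format)
    return (updated - started).total_seconds()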
# Streams `url` to disk in chunks, persisting progress to the job record so
# the status endpoint can report it, and honouring PAUSE/STOP mid-download.
def download_file(job_id, url, filename, already_processed):
    db_init = DBConnection()
    # Append when resuming a partial download, otherwise start a fresh file.
    file_mode = 'wb' if already_processed == 0 else 'ab'
    response = requests.get(url, stream=True)
    total = response.headers.get('content-length')
    content_type = response.headers.get('Content-Type')
    # guess_extension can return None (or the header may be missing entirely).
    file_extension = (mimetypes.guess_extension(content_type.split(';')[0]) or '') \
        if content_type else ''
    write_file_path = filename + file_extension
    is_break = False
    with open(write_file_path, file_mode) as f:
        if total is None:
            # No content-length header: write the whole body in one go.
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                update_query = {"job_id": job_id}
                # Re-read the job record each chunk to pick up control commands.
                result = db_init.get_job({'job_id': job_id})
                if result:
                    status = result[0]['status']
                    if status in ["PAUSE", "STOP"]:
                        update_query["status"] = status
                        is_break = True
                if is_break:
                    db_init.update_job(update_query)
                    break
                if already_processed:
                    # On resume the request restarts from byte 0, so skip
                    # bytes that are already on disk.
                    if downloaded <= already_processed:
                        continue
                    # Trim the boundary chunk so already-written bytes are
                    # not duplicated in append mode.
                    data = data[-(downloaded - already_processed):]
                    already_processed = 0
                f.write(data)
                update_query["total_file_size"] = total
                update_query["downloaded_size"] = downloaded
                update_query["remaining_size"] = total - downloaded
                db_init.update_job(update_query)
    if not is_break:
        # Mark the job finished.
        update_query = {
            "job_id": job_id,
            "end_time": datetime.utcnow(),
            "status": 'COMPLETED',
            "command": "Finished Download",
        }
        db_init.update_job(update_query)
    db_init.close()
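# The resume path above re-downloads the file from byte 0 and discards the
# already-written prefix. Where the server supports it, an HTTP Range request
# avoids that waste. A minimal sketch of the alternative (plain requests API;
# not wired into download_file above):
def open_resumable_stream(url, already_processed):
    """Request only the bytes after `already_processed`, falling back to a
    full download when the server ignores the Range header."""
    headers = {'Range': 'bytes=%d-' % already_processed} if already_processed else {}
    response = requests.get(url, headers=headers, stream=True)
    # 206 Partial Content means the server honoured the requested range.
    resumed = response.status_code == 206
    return response, resumed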
class Worker(object):
    worker_id = ""

    def do_the_job(self, data):
        job_id = data["job_id"]
        status = data["status"]
        self.db_init = DBConnection()
        result = self.db_init.get_job({'job_id': job_id})
        if result:
            data = result[0]
            url = data["input_url"]
            # On RESUME, continue from the recorded byte offset.
            already_processed = data["downloaded_size"] if status == "RESUME" else 0
            # output_path is the job id assigned at scheduling time and
            # serves as the download file name.
            download_file(job_id, url, data["output_path"], already_processed)
        self.db_init.close()

    def __init__(self):
        self.queue_name = 'urls'
        self.exchange_name = 'info'
        self.host = settings.RABBITMQ_HOST
        self.user = settings.RABBITMQ_USER
        self.password = settings.RABBITMQ_PASS
        self.credentials = pika.PlainCredentials(self.user, self.password)
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=self.host, port=5672,
                                      credentials=self.credentials))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=self.queue_name)

    def callback(self, ch, method, properties, body):
        if body:  # ignore empty deliveries
            data = json.loads(body.decode())
            if 'job_id' in data and 'status' in data:
                try:
                    self.do_the_job(data)
                except Exception as e:
                    print(str(e))
        ch.basic_ack(delivery_tag=method.delivery_tag)
        print("done with the job by worker - ", self.worker_id)
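# The class declares the queue but the listing never starts consuming. A
# minimal startup sketch using pika's standard consume API (assigning
# worker_id a fresh uuid here is an assumption; it only feeds the log line):
if __name__ == '__main__':
    worker = Worker()
    worker.worker_id = str(uuid.uuid1())
    worker.channel.basic_consume(queue=worker.queue_name,
                                 on_message_callback=worker.callback)
    worker.channel.start_consuming()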
# POST handler: schedules a download job and publishes it to the worker queue.
def post(self):
    db_init = DBConnection()
    job_id = str(uuid.uuid1())
    url = request.form['url'] if 'url' in request.form else None
    input_dict = {}
    input_dict["job_id"] = job_id
    input_dict["status"] = "SCHEDULED"
    input_dict["start_time"] = datetime.utcnow()
    input_dict["output_path"] = job_id
    input_dict["command"] = "SCHEDULED Download"
    is_invalid = False
    if not url:
        # Record the failed request and short-circuit below.
        input_dict["command"] = "no input url found for job"
        input_dict["end_time"] = datetime.utcnow()
        is_invalid = True
    else:
        input_dict["input_url"] = url
    db_init.insert_job(input_dict)
    if is_invalid:
        db_init.close()
        return jsonify(input_dict)
    message_to_publish = {'job_id': job_id, 'status': 'SCHEDULED'}
    message_publisher(message_to_publish)
    db_init.close()
    return jsonify({"job_id": job_id, "status": "SCHEDULED"})
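# `message_publisher` is referenced above but not defined in this listing. A
# minimal sketch, assuming it pushes the JSON payload onto the same 'urls'
# queue the Worker consumes, with the connection settings from Worker.__init__:
def message_publisher(message):
    credentials = pika.PlainCredentials(settings.RABBITMQ_USER,
                                        settings.RABBITMQ_PASS)
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=settings.RABBITMQ_HOST, port=5672,
                                  credentials=credentials))
    channel = connection.channel()
    channel.queue_declare(queue='urls')
    channel.basic_publish(exchange='', routing_key='urls',
                          body=json.dumps(message))
    connection.close()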
# POST handler: accepts a direct file upload and records it as a completed job.
def post(self):
    db_init = DBConnection()
    f = request.files['file_name'] if 'file_name' in request.files else None
    if f:
        path = settings.storage_path
        job_id = str(uuid.uuid1())
        filename, file_extension = os.path.splitext(f.filename)
        # Sanitise only the file name component; running secure_filename on a
        # full path would strip the directory separators.
        full_path = os.path.join(path, secure_filename(job_id + file_extension))
        f.save(full_path)
        input_dict = {}
        input_dict["job_id"] = job_id
        input_dict["status"] = "COMPLETED"
        input_dict["start_time"] = datetime.utcnow()
        input_dict["output_path"] = full_path
        input_dict["command"] = ""
        input_dict["end_time"] = datetime.utcnow()
        db_init.insert_job(input_dict)
        db_init.close()
        return jsonify(input_dict)
    db_init.close()
    return "No file found"
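# Client-side usage sketch for the upload handler, framed as a separate
# script. The '/upload' route and the file name are placeholders; the listing
# does not show the URL map:
#
#     import requests
#
#     with open('example.pdf', 'rb') as fh:
#         resp = requests.post('http://localhost:5000/upload',
#                              files={'file_name': fh})
#     print(resp.json())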
# GET handler: pause/stop/resume control for a download job.
def get(self):
    db_init = DBConnection()
    job_id = request.args.get('id', None)
    state = request.args.get('state', None)
    update_query = {"status": state, "job_id": job_id}
    if state in ['PAUSE', 'STOP', 'RESUME']:
        db_init.update_job(update_query)
        update_query["message"] = 'updated download status'
        if state == 'RESUME':
            # PAUSE/STOP are observed by the running download loop itself;
            # RESUME needs a worker to pick the job back up, so re-publish it.
            message_to_publish = {'job_id': job_id, 'status': 'RESUME'}
            message_publisher(message_to_publish)
    else:
        update_query["message"] = "no control found"
    db_init.close()
    return jsonify(update_query)
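# Control usage sketch, again as a separate client script. The
# '/download/control' route is an assumption, and '<job-id>' stands in for a
# real id returned by the scheduling endpoint:
#
#     import requests
#
#     resp = requests.get('http://localhost:5000/download/control',
#                         params={'id': '<job-id>', 'state': 'PAUSE'})
#     print(resp.json())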
# Indexer from a separate module: builds word -> record-position buckets over
# database tables, optionally in parallel. Commands, DBConnection and
# index_data are project modules assumed by this listing.
from re import split as resplit
from multiprocessing import Pool


class IndexData(object):

    def __init__(self, file_name="local_config"):
        self.resultset = []
        settings = __import__("%s" % file_name)
        self.is_threading = settings.IMPLEMENT_THREADED_SEARCH
        db_config = settings.DATABASE
        self.index_classes = settings.INDEX_CLASSES
        self.is_indexing = settings.INDEXING
        self.intervals = settings.INTERVALS
        self.bucket_intervals = settings.BUCKET_INTERVAL
        self.word_split_pattern = settings.WORD_SPLIT_PATTERN
        self.conn = DBConnection(db_config['HOST'], db_config['USER'],
                                 db_config['PASSWORD'], db_config['NAME'],
                                 db_config['PORT'], settings.FILE_PATH)

    def split_sentence(self, raw_sentence):
        return resplit(self.word_split_pattern, raw_sentence)

    def create_hash(self, sentence):
        word_list = self.split_sentence(sentence)
        return Commands.assign_weight(word_list)

    def false_index(self, data_count, table_name, field_list):
        # Dump the table to one outfile, or to several fixed-size slices
        # when threaded search is enabled.
        dump_file_counts = int(data_count / self.intervals) + 1
        start, offset = 0, self.intervals
        if not self.is_threading:
            offset = data_count
            dump_file_counts = 1
        for file_no in range(dump_file_counts):
            self.conn.create_outfile(table_name, field_list, start, offset, file_no)
            start += self.intervals

    def true_index(self, data_count, table_name, field_list):
        # Ceiling division so the final partial slice is also indexed.
        interval_count = (data_count + self.intervals - 1) // self.intervals
        start, offset = 0, self.intervals
        file_dict = {}
        pool = Pool(processes=2)
        args_list = []
        for i in range(interval_count):
            # Note: file_dict is copied into each worker process, not shared.
            args_list.append([start, offset, table_name, field_list, file_dict])
            start += offset
        pool.map(index_data, args_list)
        pool.close()
        pool.join()

    def index(self):
        for instance in self.index_classes:
            data_count = self.conn.get_table_counts(instance.table_name)
            if data_count:
                # INDEXING selects true_index or false_index by name.
                getattr(self, ("%s_index" % self.is_indexing).lower())(
                    data_count, instance.table_name, instance.field_list)
            else:
                print("No Data to Index. Exiting....")
        self.conn.close()

    @classmethod
    def run(cls, field_list, table_name, bucket=None):
        # A mutable default dict would be shared across calls, so build it here.
        if bucket is None:
            bucket = {}
        self = IndexData("local_config")
        result_set = self.conn.get_all_records(field_list, table_name)
        for pos, data in result_set:
            word_list = self.create_hash(data)
            for word, weight in word_list:
                bucket_no = Commands.assign_bucket(weight, self.bucket_intervals)
                bucket.setdefault(bucket_no, {})
                try:
                    bucket[bucket_no][word].append(pos)
                except KeyError:
                    bucket[bucket_no][word] = [pos]
        return bucket
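# Commands.assign_weight and Commands.assign_bucket are not shown in the
# listing. A minimal sketch of plausible implementations, assuming a word's
# weight is a number derived from its characters and buckets are fixed-width
# ranges of that weight (both behaviours are assumptions, not the project's
# actual code):
class Commands(object):

    @staticmethod
    def assign_weight(word_list):
        """Pair each non-empty word with a numeric weight."""
        return [(word, sum(ord(ch) for ch in word)) for word in word_list if word]

    @staticmethod
    def assign_bucket(weight, bucket_interval):
        """Map a weight onto a fixed-width bucket number."""
        return weight // bucket_interval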