def callback(ch, method, properties, body):
    """Align the skeleton MEI of a sheet with a set of partial MEI files.

    Expects a JSON message with:
      - name: name of the current sheet.
      - partials: file names of the additional MEI files (including
        extension). These "partial" MEIs need to be in the "whole" folder
        of the sheet already, just like the skeleton (aligned.mei).

    The consensus of skeleton + partials is written back to aligned.mei and
    a status message is published to the OMR planner status queue.
    NOTE(review): relies on module-level `db`, `cfg`, `fsm`, `tt`, `ta`,
    `xml` and `channel` being initialized elsewhere in this file.
    """
    data = json.loads(body)
    sheet_name = data['name']
    partial_file_names = data['partials']
    # Get sheet id (for status queue)
    sheet_id = str(db[cfg.col_sheet].find_one({"name": sheet_name})["_id"])
    whole_dir = fsm.get_sheet_whole_directory(sheet_name)
    skeleton_path = whole_dir / 'aligned.mei'
    partial_paths = [whole_dir / partial for partial in partial_file_names]
    # The skeleton always has exactly one section, which just contains the
    # measures and some additional tags.
    skeleton_document = xml.parse(str(skeleton_path)).documentElement
    skeleton_section = skeleton_document.getElementsByTagName("section")[0]
    skeleton_section_xml = tt.purge_non_element_nodes(skeleton_section).toxml()
    partial_sections_xml = []
    for partial_path in partial_paths:
        # Missing partial files are silently skipped (best-effort alignment).
        if partial_path.is_file():
            partial = xml.parse(str(partial_path))
            # Extract the measures and put them under a "fake" section root
            # to get a structure similar to the skeleton's section.
            partial = tt.replace_child_nodes(
                tt.create_element_node("section"),
                partial.getElementsByTagName("measure"))
            partial = tt.purge_non_element_nodes(partial)
            partial_sections_xml.append(partial.toxml())
    # Perform the alignments and node picking
    aligned_trees = ta.align_trees_multiple([skeleton_section_xml] +
                                            partial_sections_xml)
    final_section_tree, _ = ta.build_consensus_tree(
        aligned_trees, consensus_method=ta.consensus_bnd_enrich_skeleton)
    # The final tree only aligned the section with measures, so we need to
    # put the contents of that section back into the skeleton document now.
    tt.replace_child_nodes(skeleton_section, final_section_tree.childNodes)
    # Write the final tree to a file. We also purge everything that is not an
    # element, to keep the tree clean and easily output a prettified XML file.
    with open(whole_dir / 'aligned.mei', 'w') as aligned_mei_file:
        aligned_mei_file.write(
            tt.purge_non_element_nodes(skeleton_document).toprettyxml())
    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'aligner',
        'status': 'complete',
        'name': sheet_name
    }
    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))
def callback(ch, method, properties, body):
    """Convert an uploaded PDF sheet into page images and an MEI skeleton.

    Expects a JSON message with '_id': the Mongo id of the PDF sheet entry.
    Converts every PDF page to a JPEG, records the page paths on the sheet
    document, runs the measure detector to produce aligned.mei, and pushes
    a completion message onto the status queue.

    Raises:
        Exception: if no sheet entry exists for the given id.
    """
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']
    # Initiate mongo client and sheet collection
    client = MongoClient(settings.mongo_address[0],
                         int(settings.mongo_address[1]))
    db = client.trompa_test
    sheet_collection = db[settings.sheet_collection_name]
    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    print(pdf_sheet)
    # Bug fix: validate existence BEFORE dereferencing the entry. The check
    # previously ran after pdf_sheet["sheet_path"], so a missing sheet
    # raised TypeError instead of this explicit error.
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem
    # PDF -> JPEG
    print("Converting PDF to JPEG page images...")
    pages = convert_from_path(pdf_sheet_path.absolute(), 300)
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    for index, page in enumerate(pages):
        page_path = img_pages_path / f'page_{index}.jpg'
        page.save(page_path, 'JPEG')
        sheet_collection.update_one({'sheet_path': str(pdf_sheet_path)},
                                    {'$push': {
                                        'pages_path': str(page_path)
                                    }},
                                    upsert=True)
        # Bug fix: report 1-based progress (index is 0-based).
        print(f"{index + 1} pages out of {len(pages)}")
    print("DONE")
    # JPEG -> MEI
    print("Converting JPEG pages to MEI skeleton...")
    to_mei.run(pdf_sheet_name)
    # Update sheet on mongo
    # TODO: This doesn't seem necessary given that the mei will always be called "aligned.mei", the fsm can handle the paths
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {
                                    'mei_path': str(mei_path)
                                }},
                                upsert=True)
    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name
    }
    add_to_queue('status_queue', 'status_queue', json.dumps(status_update_msg))
    print(
        f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!"
    )
def callback(ch, method, properties, body):
    """Splice an aggregated crowd result back into the sheet's MEI.

    Looks up the aggregated XML for the given task, replaces the content of
    every measure in aligned.mei whose "n" attribute has a counterpart in
    the aggregation, writes the file back, and reports completion on the
    status queue.
    """
    global channel
    payload = json.loads(body)
    sheet_name = payload['name']
    task_id = payload['task_id']
    client = MongoClient(settings.mongo_address[0],
                         int(settings.mongo_address[1]))
    db = client.trompa_test
    sheet_record = db[settings.sheet_collection_name].find_one(
        {"name": sheet_name})
    sheet_id = str(sheet_record["_id"])
    # Fetch the aggregated result and index its measures by "n" attribute.
    aggregated_result = db[
        settings.aggregated_result_collection_name].find_one(
            {"task_id": task_id})
    aggregated_xml = xml.parseString("<mei>" + aggregated_result["xml"] +
                                     "</mei>")
    measures_by_n = {}
    for aggregated_measure in aggregated_xml.getElementsByTagName("measure"):
        measures_by_n[
            aggregated_measure.attributes["n"].value] = aggregated_measure
    # Load the sheet's MEI and swap in the aggregated measure contents.
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_document = xml.parse(str(mei_path))
    for mei_measure in mei_document.getElementsByTagName("measure"):
        number = mei_measure.attributes["n"].value
        replacement = measures_by_n.get(number)
        if replacement is not None:
            mei_measure.childNodes = replacement.childNodes
    # Persist the updated MEI.
    with open(str(mei_path), 'w') as mei_file:
        mei_file.write(mei_document.toxml())
    # Report completion on the status queue.
    status_update_msg = {
        '_id': sheet_id,
        'module': 'score_rebuilder',
        'status': 'complete',
        'name': sheet_name,
        'task_id': task_id
    }
    channel.queue_declare(queue="status_queue")
    channel.basic_publish(exchange="",
                          routing_key="status_queue",
                          body=json.dumps(status_update_msg))
def callback(ch, method, properties, body):
    """Combine an aggregated crowd result slice with the sheet's MEI.

    Expects a JSON message with 'name' (sheet name) and 'task_id'. Looks up
    the aggregated result for that task and step, gives the slice one
    measure of leading context, aligns it against the sheet's section via
    the tree aligner, writes the consensus back to aligned.mei, and reports
    'complete' (or 'failed' when no aggregated result exists) on the task
    scheduler status queue.
    """
    data = json.loads(body)
    sheet_name = data['name']
    task_id = data['task_id']
    client = MongoClient(cfg.mongodb_address.ip, cfg.mongodb_address.port)
    db = client[cfg.db_name]
    # Get MEI file and measures
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_xml_tree = xml.parse(str(mei_path))
    # mei_measures = mei_xml.getElementsByTagName("measure")
    # Obtain corresponding task and slice
    task = db[cfg.col_task].find_one({"_id": ObjectId(task_id)})
    # measure_staff_slice = db[cfg.col_slice].find_one({"_id" : ObjectId(task["slice_id"])})
    # slice_measures = mei_measures[measure_staff_slice["start"]: measure_staff_slice["end"]]
    # Get aggregated XML
    aggregated_result = db[cfg.col_aggregated_result].find_one({
        "task_id": task_id,
        "step": task["step"]
    })
    if aggregated_result:
        aggregated_xml = aggregated_result["result"]
        # Temporary solution: give the slice somewhat more context by
        # inserting only the header of the previous measure into it.
        tree = xml.parseString(aggregated_xml).documentElement
        index = int(tree.getElementsByTagName("measure")[0].getAttribute(
            "n")) - 1  # n-index is shifted up by 1
        if index > 0:
            # Clone the previous measure but strip its children: only the
            # measure header (attributes) is used as anchoring context.
            measure = mei_xml_tree.getElementsByTagName("measure")[
                index - 1].cloneNode(deep=True)  # get the previous measure
            measure.childNodes = []
            tree.insertBefore(measure, tree.childNodes[0])
            aggregated_xml = tree.toxml()
        # Perform combination with original MEI via tree aligner
        mei_section = mei_xml_tree.getElementsByTagName("section")[0]
        mei_section_xml = mei_section.toxml()
        aligned_trees = ta.align_trees_multiple(
            [mei_section_xml, aggregated_xml],
            distance_function=ta.node_distance_anchored)
        final_section_tree, _ = ta.build_consensus_tree(
            aligned_trees, consensus_method=ta.consensus_bnd_override_inner)
        # Graft the consensus back into the document before serializing.
        tt.replace_child_nodes(mei_section, final_section_tree.childNodes)
        # Write MEI file (purged to elements only for clean pretty-printing)
        with open(str(mei_path), 'w') as mei_file:
            mei_file.write(
                tt.purge_non_element_nodes(
                    mei_xml_tree.documentElement).toprettyxml())
        status_update_msg = {
            '_id': task_id,
            'module': 'score_rebuilder',
            'status': 'complete'
        }
    else:
        print(
            f"Aggregated result for task with id {task_id} at step {task['step']} did not exist!"
        )
        status_update_msg = {
            '_id': task_id,
            'module': 'score_rebuilder',
            'status': 'failed'
        }
    global channel
    channel.queue_declare(queue=cfg.mq_task_scheduler_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_task_scheduler_status,
                          body=json.dumps(status_update_msg))
def callback(ch, method, properties, body):
    """Convert a PDF sheet into page images and an MEI skeleton (cfg-based).

    Expects a JSON message with '_id': the Mongo id of the sheet entry.
    Converts the PDF page by page (to bound pdf2image's memory usage),
    records each page path on the sheet document, runs the measure detector
    unless a skeleton already exists, and publishes a completion message.

    Raises:
        Exception: if no sheet entry exists for the given id.
    """
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']
    # Initiate mongo client and sheet collection
    client = MongoClient(cfg.mongodb_address.ip, cfg.mongodb_address.port)
    db = client[cfg.db_name]
    sheet_collection = db[cfg.col_sheet]
    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    print(pdf_sheet)
    # Bug fix: validate existence BEFORE dereferencing the entry; the check
    # previously ran after pdf_sheet["sheet_path"] and could never fire.
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem
    # PDF -> JPEG, one page per call to keep pdf2image's memory use bounded
    print("Converting PDF to JPEG page images...")
    i = 1
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    while True:
        # Bug fix: only the conversion call is guarded, and only against
        # IndexError (convert_from_path returns an empty list past the last
        # page, so [0] raises). The previous bare `except:` also swallowed
        # save/database errors and reported them as "end of PDF".
        try:
            page = convert_from_path(pdf_sheet_path.absolute(),
                                     300,
                                     first_page=i,
                                     last_page=i + 1)[0]
        except IndexError:
            print("Reached end of PDF")
            break
        page_path = img_pages_path / f'page_{i}.jpg'
        page.save(page_path, 'JPEG')
        sheet_collection.update_one({'sheet_path': str(pdf_sheet_path)},
                                    {'$push': {'pages_path': str(page_path)}})
        del page  # release the decoded image immediately
        # Bug fix: the old message printed "out of {len(pages)}" where
        # `pages` was never appended to, so the total was always 0.
        print(f"Converted page {i}")
        i += 1
    print("PDF conversion finished succesfully!")
    # JPEG -> MEI
    if fsm.skeleton_exists(pdf_sheet_name):
        print("Using pre-existing skeleton, skipping measure detection...")
    else:
        print("Converting JPEG pages to MEI skeleton via measure detector...")
        to_mei.run(pdf_sheet_name, connection)
    # Update sheet on mongo
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {'mei_path': str(mei_path)}})
    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name}
    add_to_queue(
        cfg.mq_omr_planner_status,
        cfg.mq_omr_planner_status,
        json.dumps(status_update_msg))
    print(f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!")
def __init__(self, name):
    """Build the in-memory score model for a sheet from its aligned.mei.

    Parses the MEI, collects the facsimile zones and page image names, then
    walks the single section's element children (pb / sb / scoreDef /
    measure) to construct Measure, Line and Page objects together with a
    per-measure score context.

    name: name of the sheet; used to locate its MEI and page images.
    """
    # Create all relevant paths and names
    mei_path = fsm.get_sheet_whole_directory(name) / "aligned.mei"
    self.pages_path = fsm.get_sheet_pages_directory(name)
    self.name = name
    # Data structures
    self.measures = []
    self.lines = []
    self.pages = []
    self.images = {}
    # MEI parsing
    self.mei = xml.parse(str(mei_path))
    # Store the zones in a dict (keyed by xml:id) and collect page images
    image_names = []
    zones = {}
    for surface in self.mei.getElementsByTagName("surface"):
        graphic = surface.getElementsByTagName("graphic")[0]
        image_name = graphic.attributes["target"].value
        image_names.append(image_name)
        for zone in surface.getElementsByTagName("zone"):
            zones[zone.attributes["xml:id"].value] = zone
    line = []
    page = []
    # Only element children of the (single) section are structural entries.
    entries = [x for x in self.mei.getElementsByTagName("section")[0].childNodes
               if x.nodeType == xml.Node.ELEMENT_NODE]
    # The leading pb opens page 1 rather than closing a previous page.
    skipped_first_page_tag = False
    accumulated_score_def = self.mei.getElementsByTagName("scoreDef")[0]  # The initial one
    last_score_def_index = -1
    score_def_before_measure = None
    self.context = self.build_initial_context()
    for entry_index, entry in enumerate(entries):
        if entry.tagName == "pb":
            if not skipped_first_page_tag:
                skipped_first_page_tag = True
            else:
                self.pages.append(Page(tuple(page),
                                       page[0].measures[0].index,
                                       len(self.pages),
                                       image_names[len(self.pages)]))
                del page[:]
        if entry.tagName == "scoreDef":
            # Fold this scoreDef into the accumulated one and remember it so
            # it can be attached to the next measure.
            self.update_score_def_with_score_def(accumulated_score_def, entry)
            last_score_def_index = entry_index
            score_def_before_measure = entry
        # Bug fix: the original condition read
        #   entry.tagName == "sb" or entry.tagName == "pb" and line
        # which parses as `sb or (pb and line)`, so an "sb" occurring while
        # `line` was still empty crashed on line[0]. Both break kinds should
        # only flush a non-empty line.
        if (entry.tagName == "sb" or entry.tagName == "pb") and line:
            line_obj = Line(tuple(line), line[0].index, len(self.lines))
            self.lines.append(line_obj)
            page.append(line_obj)
            del line[:]
        if entry.tagName == "measure":
            staffs = []
            for staff in entry.getElementsByTagName("staff"):
                zone = zones[staff.attributes["facs"].value[1:]]
                ulc = tuple([int(v) for v in (zone.attributes["ulx"].value,
                                              zone.attributes["uly"].value)])  # Upper left corner
                lrc = tuple([int(v) for v in (zone.attributes["lrx"].value,
                                              zone.attributes["lry"].value)])  # Lower right corner
                # TODO: this seems redundant, should be solved by making specific slices instead
                has_clef = False
                # If the line list is empty, this measure is the first
                # measure of a line, and thus the staff contains a clef
                if not line:
                    has_clef = True
                inner_xml = staff.toxml()
                score_staff = Staff(ulc, lrc,
                                    lrc[0] - ulc[0], lrc[1] - ulc[1],
                                    len(staffs), len(self.measures),
                                    len(self.lines), len(self.pages),
                                    inner_xml, has_clef)
                staffs.append(score_staff)
            # Adapt context for measure
            measure_context = self.context.cloneNode(deep=True)
            measure_context_score_def = measure_context.getElementsByTagName("scoreDef")[0]
            self.update_score_def_with_score_def(measure_context_score_def,
                                                 accumulated_score_def)
            for staffDef in measure_context_score_def.getElementsByTagName("staffDef"):
                staff_n = staffDef.getAttribute("n")
                clef, clef_entry_index = self.backtrack_first_staff_with_clef(
                    entry, entries, staff_n)
                # If the clef came later than the last scoredef, it should
                # override the scoredef's clef
                if clef and clef_entry_index > last_score_def_index:
                    self.update_score_def_with_clef(measure_context_score_def,
                                                    clef, staff_n)
            score_def_before_measure_xml = None
            if score_def_before_measure:
                score_def_before_measure_xml = score_def_before_measure.toxml()
                score_def_before_measure = None
            score_measure = Measure(staffs, len(self.measures), entry.toxml(),
                                    measure_context.toxml(),
                                    score_def_before_measure_xml)
            self.measures.append(score_measure)
            line.append(score_measure)
def callback(ch, method, properties, body):
    """Run post-processing steps on a sheet's MEI and report status.

    Expects a JSON message with 'name' (sheet name), 'steps' (list of
    post-processing step names) and 'task_id'. Currently only the "clef"
    step is implemented: a clef that appears as the first element of a
    layer is moved into a staffDef (clef.line / clef.shape attributes) on
    the nearest preceding scoreDef, creating scoreDef/staffGrp/staffDef
    nodes as needed. Writes the MEI back only when steps were requested,
    then publishes 'complete' on the task scheduler status queue.
    """
    data = json.loads(body)
    sheet_name = data['name']
    post_processing_steps = data["steps"]
    task_id = data['task_id']
    # Get MEI file
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_xml_tree = tt.purge_non_element_nodes(xml.parse(str(mei_path)))
    mei_section = mei_xml_tree.getElementsByTagName("section")[0]
    if "clef" in post_processing_steps:
        print(f"Performing clef post-processing for sheet {sheet_name}")
        for layer in mei_xml_tree.getElementsByTagName("layer"):
            element = layer.firstChild
            # Idiom fix: `is not None` instead of `!= None`.
            if element is not None and element.tagName == "clef":
                staff = layer.parentNode
                measure = staff.parentNode
                clef_line = element.getAttribute("line")
                clef_shape = element.getAttribute("shape")
                layer.removeChild(element)
                # Walk backwards to find a scoreDef directly preceding this
                # measure (stop at the previous measure).
                prev = measure.previousSibling
                scoreDef = None
                while prev:
                    if prev.tagName == "measure":
                        break
                    if prev.tagName == "scoreDef":
                        scoreDef = prev
                        break
                    prev = prev.previousSibling
                # TODO: actually generalize this code
                if not scoreDef:
                    scoreDef = tt.create_element_node("scoreDef")
                    mei_section.insertBefore(scoreDef, measure)
                staffGrp = tt.first_or_none(scoreDef, "staffGrp")
                if not staffGrp:
                    staffGrp = tt.create_element_node("staffGrp")
                    scoreDef.appendChild(staffGrp)
                staffDef = tt.first_or_none(
                    staffGrp, "staffDef",
                    lambda e: e.getAttribute("n") == staff.getAttribute("n"))
                if not staffDef:
                    staffDef = tt.create_element_node(
                        "staffDef", {"n": staff.getAttribute("n")})
                    staffGrp.appendChild(staffDef)
                staffDef.setAttribute("clef.line", clef_line)
                staffDef.setAttribute("clef.shape", clef_shape)
    # Write MEI file if there were changes
    if post_processing_steps:
        with open(str(mei_path), 'w') as mei_file:
            mei_file.write(
                tt.purge_non_element_nodes(
                    mei_xml_tree.documentElement).toprettyxml())
    status_update_msg = {
        '_id': task_id,
        'module': 'post_processing',
        'status': 'complete'
    }
    global channel
    channel.queue_declare(queue=cfg.mq_task_scheduler_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_task_scheduler_status,
                          body=json.dumps(status_update_msg))
def run(sheet_name, connection):
    """Detect measures on a sheet's page images via the measure-detector
    HTTP service and write an MEI skeleton (aligned.mei) for the sheet.

    sheet_name: name of the sheet whose page images are processed.
    connection: unused in this variant — TODO confirm it is kept only for
        signature compatibility with the detector-based run().
    """
    # Function-local imports keep this module importable without the heavy
    # dependencies when run() is never called.
    import datetime
    import sys
    sys.path.append("..")
    import common.file_system_manager as fsm
    from common.settings import cfg
    from glob import glob
    import json
    import os
    from uuid import uuid4
    from lxml import etree
    from PIL import Image, ImageFont
    from PIL.ImageDraw import ImageDraw
    import requests
    from tqdm import tqdm

    version = '1.0.0'
    # Empty MEI document; facsimile and body are filled in below.
    template = f'''<?xml version="1.0" encoding="UTF-8"?>
<mei xmlns="http://www.music-encoding.org/ns/mei">
  <meiHead>
    <fileDesc>
      <titleStmt>
        <title/>
      </titleStmt>
      <pubStmt/>
    </fileDesc>
    <encodingDesc>
      <appInfo>
        <application isodate="{datetime.datetime.now().replace(microsecond=0).isoformat()}" version="{version}">
          <name>MeasureDetector</name>
          <p>Measures detected with MeasureDetector</p>
        </application>
      </appInfo>
    </encodingDesc>
  </meiHead>
  <music>
    <facsimile>
    </facsimile>
    <body>
    </body>
  </music>
</mei>'''.encode()

    def draw_boxes(image_path, measures):
        # Debugging aid: draw translucent boxes over the detected measures
        # and save the overlay under a "bboxes" sibling directory.
        image = Image.open(image_path).convert('RGBA')
        overlay = Image.new('RGBA', image.size)
        image_draw = ImageDraw(overlay)
        for measure in measures:
            image_draw.rectangle([
                int(measure['left']),
                int(measure['top']),
                int(measure['right']),
                int(measure['bottom'])
            ], fill='#00FFFF1B')
        for m, measure in enumerate(measures):
            image_draw.rectangle([
                int(measure['left']),
                int(measure['top']),
                int(measure['right']),
                int(measure['bottom'])
            ], outline='#008888', width=2)
        result_image = Image.alpha_composite(image, overlay).convert('RGB')
        target_dir = os.path.join(os.path.dirname(image_path), 'bboxes')
        os.makedirs(target_dir, exist_ok=True)
        basename = os.path.basename(image_path)
        result_path = os.path.join(target_dir, basename)
        result_image.save(result_path)

    # Detect measures: sort pages numerically on the page_<n>.jpg file name.
    page_path = fsm.get_sheet_pages_directory(sheet_name)
    image_paths = sorted(
        [str(p.resolve()) for p in page_path.iterdir() if p.is_file()],
        key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))
    pages = []
    tqdm.write(f'Detecting measures in {len(image_paths)} images...')
    for image_path in tqdm(image_paths, unit='img'):
        with open(image_path, 'rb') as image:
            address = ":".join(map(str, cfg.measure_detector_address))
            response = requests.post(f'http://{address}/upload',
                                     files={'image': image})
        measures = json.loads(response.content.decode('utf-8'))['measures']
        pages.append({'path': image_path, 'measures': measures})
    # Generate MEI file
    xml_parser = etree.XMLParser(remove_blank_text=True)
    mei = etree.fromstring(template, parser=xml_parser)
    mei_facsimile = mei.xpath('//*[local-name()="facsimile"]')[0]
    mei_body = mei.xpath('//*[local-name()="body"]')[0]
    mei_mdiv = etree.Element('mdiv')
    mei_mdiv.attrib[
        '{http://www.w3.org/XML/1998/namespace}id'] = 'mdiv_' + str(uuid4())
    mei_mdiv.attrib['n'] = str(1)
    mei_mdiv.attrib['label'] = ''
    mei_body.append(mei_mdiv)
    mei_score = etree.Element('score')
    mei_score.append(etree.Element('scoreDef'))
    mei_mdiv.append(mei_score)
    mei_section = etree.Element('section')
    mei_score.append(mei_section)
    mei_section.append(etree.Element('pb'))
    cur_ulx = 0
    cur_measure = 1
    for p, page in enumerate(pages):
        # Page dimensions are read from the image itself.
        image = Image.open(page['path'])
        image_width, image_height = image.size
        image.close()
        measures = page['measures']
        print(measures)
        # TODO: restore this functionality in some other way?
        # if args.make_images:
        #     draw_boxes(page['path'], measures)
        mei_surface = etree.Element('surface')
        mei_surface.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'surface_' + str(
                uuid4())
        mei_surface.attrib['n'] = str(p + 1)
        mei_surface.attrib['ulx'] = str(0)
        mei_surface.attrib['uly'] = str(0)
        mei_surface.attrib['lrx'] = str(image_width - 1)
        mei_surface.attrib['lry'] = str(image_height - 1)
        mei_facsimile.append(mei_surface)
        mei_graphic = etree.Element('graphic')
        mei_graphic.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'graphic_' + str(
                uuid4())
        mei_graphic.attrib['target'] = os.path.basename(page['path'])
        mei_graphic.attrib['width'] = str(image_width)
        mei_graphic.attrib['height'] = str(image_height)
        mei_surface.append(mei_graphic)
        for m, measure in enumerate(measures):
            print(measure)
            mei_zone = etree.Element('zone')
            mei_zone_id = 'zone_' + str(uuid4())
            mei_zone.attrib[
                '{http://www.w3.org/XML/1998/namespace}id'] = mei_zone_id
            mei_zone.attrib['type'] = 'measure'
            mei_zone.attrib['ulx'] = str(int(measure['ulx']))
            mei_zone.attrib['uly'] = str(int(measure['uly']))
            mei_zone.attrib['lrx'] = str(int(measure['lrx']))
            mei_zone.attrib['lry'] = str(int(measure['lry']))
            mei_surface.append(mei_zone)
            mei_measure = etree.Element('measure')
            mei_measure.attrib[
                '{http://www.w3.org/XML/1998/namespace}id'] = 'measure_' + str(
                    uuid4())
            mei_measure.attrib['n'] = str(cur_measure)
            mei_measure.attrib['label'] = str(cur_measure)
            mei_measure.attrib['facs'] = f'#{mei_zone_id}'
            mei_section.append(mei_measure)
            cur_measure += 1
            # A system break (sb) is emitted when the next measure starts
            # further left than this one; the last measure of a page also
            # ends a system.
            if len(measures) > m + 1 and measures[m + 1]['ulx'] < measure['ulx']:
                mei_section.append(etree.Element('sb'))
            elif len(measures) <= m + 1:
                mei_section.append(etree.Element('sb'))
        mei_section.append(etree.Element('pb'))
    mei_path = fsm.get_sheet_whole_directory(sheet_name)
    mei_file_dir = mei_path / "aligned.mei"
    with open(str(mei_file_dir), 'wb') as file:
        # NOTE: local `xml` shadows any module-level `xml` import here.
        xml = etree.ElementTree(mei)
        xml.write(file, encoding='utf-8', pretty_print=True,
                  xml_declaration=True)
    tqdm.write('Done.')
def callback(channel, method, properties, body):
    """Convert a PDF sheet into page images and an MEI skeleton, then ack.

    Expects a JSON message with '_id': the Mongo id of the sheet entry.
    Uses the channel passed by the consumer (no globals) and acknowledges
    the delivery after publishing the status message.

    Raises:
        Exception: if no sheet entry exists for the given id.
    """
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']
    sheet_collection = db[cfg.col_sheet]
    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    print(pdf_sheet)
    # Bug fix: validate existence BEFORE dereferencing the entry; the check
    # previously ran after pdf_sheet["sheet_path"] and could never fire.
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem
    # PDF -> JPEG
    print("Converting PDF to JPEG page images...")
    # This awkward loop is done to prevent pdf2image from loading the entire
    # PDF into memory, which for some reason costs several gigabytes for
    # large sheets...
    i = 1
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    while True:
        # Bug fix: only the conversion call is guarded, and only against
        # IndexError (convert_from_path returns an empty list past the last
        # page, so [0] raises). The previous bare `except:` also swallowed
        # save/database errors and reported them as "end of PDF".
        try:
            page = convert_from_path(pdf_sheet_path.absolute(),
                                     300,
                                     first_page=i,
                                     last_page=i + 1)[0]
        except IndexError:
            print("Reached end of PDF")
            break
        page_path = img_pages_path / f'page_{i}.jpg'
        page.save(page_path, 'JPEG')
        sheet_collection.update_one(
            {'sheet_path': str(pdf_sheet_path)},
            {'$push': {
                'pages_path': str(page_path)
            }})
        del page  # release the decoded image immediately
        # Bug fix: the old message printed "out of {len(pages)}" where
        # `pages` was never appended to, so the total was always 0.
        print(f"Converted page {i}")
        i += 1
    print("PDF conversion finished succesfully!")
    # JPEG -> MEI
    if cfg.skip_measure_detection and fsm.skeleton_exists(pdf_sheet_name):
        print("Using pre-existing skeleton, skipping measure detection...")
    else:
        print("Converting JPEG pages to MEI skeleton via measure detector...")
        to_mei.run(pdf_sheet_name, connection)
    # Update sheet on mongo
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {
                                    'mei_path': str(mei_path)
                                }})
    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name
    }
    channel.basic_publish(exchange='',
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))
    channel.basic_ack(method.delivery_tag)
    print(
        f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!"
    )
def run(sheet_name, connection):
    """Detect measures/staffs on a sheet's page images with the local
    detector and write an MEI skeleton (aligned.mei) for the sheet.

    sheet_name: name of the sheet whose page images are processed.
    connection: AMQP connection; process_data_events() is called between
        pages to keep the connection's heartbeat alive during detection.
    """
    version = '1.0.0'
    # Empty MEI document; facsimile and body are filled in below.
    template = f'''<?xml version="1.0" encoding="UTF-8"?>
<mei xmlns="http://www.music-encoding.org/ns/mei">
  <meiHead>
    <fileDesc>
      <titleStmt>
        <title/>
      </titleStmt>
      <pubStmt/>
    </fileDesc>
    <encodingDesc>
      <appInfo>
        <application isodate="{datetime.datetime.now().replace(microsecond=0).isoformat()}" version="{version}">
          <name>MeasureDetector</name>
          <p>Measures detected with MeasureDetector</p>
        </application>
      </appInfo>
    </encodingDesc>
  </meiHead>
  <music>
    <facsimile>
    </facsimile>
    <body>
    </body>
  </music>
</mei>'''.encode()
    # Detect measures: sort pages numerically on the page_<n>.jpg file name.
    page_path = fsm.get_sheet_pages_directory(sheet_name)
    image_paths = sorted(
        [str(p.resolve()) for p in page_path.iterdir() if p.is_file()],
        key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))
    results = []
    tqdm.write(f'Detecting measures in {len(image_paths)} images...')
    for image_path in tqdm(image_paths, unit='img'):
        page = detector.detect_measures(image_path)
        results.append({'path': image_path, 'page': page})
        connection.process_data_events()  # keep the AMQP heartbeat alive
    # Generate MEI file
    xml_parser = etree.XMLParser(remove_blank_text=True)
    mei = etree.fromstring(template, parser=xml_parser)
    mei_facsimile = mei.xpath('//*[local-name()="facsimile"]')[0]
    mei_body = mei.xpath('//*[local-name()="body"]')[0]
    mei_mdiv = etree.Element('mdiv')
    mei_mdiv.attrib[
        '{http://www.w3.org/XML/1998/namespace}id'] = 'mdiv_' + str(uuid4())
    mei_mdiv.attrib['n'] = str(1)
    mei_mdiv.attrib['label'] = ''
    mei_body.append(mei_mdiv)
    mei_score = etree.Element('score')
    mei_score_def = etree.Element('scoreDef')
    mei_score.append(mei_score_def)
    mei_mdiv.append(mei_score)
    mei_section = etree.Element('section')
    mei_score.append(mei_section)
    mei_section.append(etree.Element('pb'))
    cur_measure, cur_staff = 1, 1
    staff_counts = []
    section_lengths = []
    measures_per_page = []
    for p, result in enumerate(results):
        page, path = result['page'], result['path']
        # Bug fix: print after unpacking `result`. The original printed
        # before the assignment, showing the stale `page` left over from the
        # detection loop (always the last detected page).
        print("Processing page", page)
        mei_surface = etree.Element('surface')
        mei_surface.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'surface_' + str(
                uuid4())
        mei_surface.attrib['n'] = str(p + 1)
        mei_surface.attrib['ulx'] = str(0)
        mei_surface.attrib['uly'] = str(0)
        mei_surface.attrib['lrx'] = str(page.width - 1)
        mei_surface.attrib['lry'] = str(page.height - 1)
        mei_facsimile.append(mei_surface)
        mei_graphic = etree.Element('graphic')
        mei_graphic.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'graphic_' + str(
                uuid4())
        mei_graphic.attrib['target'] = os.path.basename(path)
        mei_graphic.attrib['width'] = str(page.width)
        mei_graphic.attrib['height'] = str(page.height)
        mei_surface.append(mei_graphic)
        for s, system in enumerate(page.systems):
            for m, measure in enumerate(system.measures):
                mei_measure = etree.Element('measure')
                mei_measure.attrib[
                    '{http://www.w3.org/XML/1998/namespace}id'] = 'measure_' + str(
                        uuid4())
                mei_measure.attrib['n'] = str(cur_measure)
                mei_measure.attrib['label'] = str(cur_measure)
                mei_section.append(mei_measure)
                cur_staff = 1
                for st, staff in enumerate(measure.staffs):
                    mei_zone = etree.Element('zone')
                    mei_zone_id = 'zone_' + str(uuid4())
                    mei_zone.attrib[
                        '{http://www.w3.org/XML/1998/namespace}id'] = mei_zone_id
                    mei_zone.attrib['type'] = 'staff'
                    mei_zone.attrib['ulx'] = str(int(staff.ulx))
                    mei_zone.attrib['uly'] = str(int(staff.uly))
                    mei_zone.attrib['lrx'] = str(int(staff.lrx))
                    mei_zone.attrib['lry'] = str(int(staff.lry))
                    mei_surface.append(mei_zone)
                    mei_staff = etree.Element('staff')
                    mei_staff.attrib[
                        '{http://www.w3.org/XML/1998/namespace}id'] = 'staff_' + str(
                            uuid4())
                    mei_staff.attrib['n'] = str(cur_staff)
                    mei_staff.attrib['label'] = str(cur_staff)
                    mei_staff.attrib['facs'] = f'#{mei_zone_id}'
                    # Staffs should have at least one layer, can optionally be
                    # enumerated with "n" if we need more
                    mei_layer = etree.Element('layer')
                    mei_staff.append(mei_layer)
                    mei_measure.append(mei_staff)
                    cur_staff += 1
                staff_counts.append(cur_staff - 1)
                cur_measure += 1
            mei_section.append(etree.Element('sb'))
            section_lengths.append(cur_measure - 1 - sum(section_lengths))
        mei_section.append(etree.Element('pb'))
        measures_per_page.append(cur_measure - 1 - sum(measures_per_page))
    # Add the most likely staff configuration to the scoredef
    # NOTE: does not generalize to scores with more than one staff configuration
    mei_staff_group = etree.Element('staffGrp')
    mei_score_def.append(mei_staff_group)
    for i in range(round(np.mean(staff_counts))):
        n = i + 1
        mei_staff_def = etree.Element('staffDef')
        mei_staff_def.attrib['n'] = str(n)
        mei_staff_def.attrib['lines'] = '5'  # Render looks weird without lines
        mei_staff_group.append(mei_staff_def)
    # Print some detection statistics (redundant nested np.mean calls
    # removed; np.mean of a scalar is the scalar, so output is unchanged)
    print("Detection Statistics:")
    print(f"{' mean staff count:':<20}{np.mean(staff_counts)}")
    print(f"{' mean line length:':<20}{np.mean(section_lengths)}")
    print(
        f"{' mean measures per page:':<20}{np.mean(measures_per_page)}"
    )
    print(f"{' measures per page:'}")
    for i, count in enumerate(measures_per_page):
        page = i + 1
        print(f"{' - ' + str(page):<20}{count}")
    mei_path = fsm.get_sheet_whole_directory(sheet_name)
    mei_file_dir = mei_path / "aligned.mei"
    with open(str(mei_file_dir), 'wb') as file:
        xml = etree.ElementTree(mei)
        xml.write(file, encoding='utf-8', pretty_print=True)
    tqdm.write('Done.')
def callback(ch, method, properties, body):
    """Create and initialize a GitHub repository for a sheet.

    Expects a JSON message with 'name': the sheet name. Optionally deletes a
    pre-existing repo, creates a fresh one, clones it (with retries), pushes
    the sheet PDF to the main branch, creates the crowd-manager branch with
    the MEI file, protects both branches, and finally publishes 'complete'
    or 'failed' on the OMR planner status queue.

    NOTE(review): relies on module-level `db`, `cfg`, `fsm`, `connection`
    and `channel`, plus the sibling `commit`/`push` helpers.
    """
    data = json.loads(body)
    sheet_name = data['name']
    # Get sheet id
    sheet_id = str(db[cfg.col_sheet].find_one({"name" : sheet_name})["_id"])
    # Github
    github = Github(cfg.github_token)
    org = github.get_organization(cfg.github_organization)
    if cfg.delete_if_exists:
        try:
            org.get_repo(sheet_name).delete()
            print("Deleted existing repo for", sheet_name)
        except GithubException as e:
            # Deletion failing just means there was nothing to delete.
            print("Repo doesn't exist, ready for creation!")
            print(str(e))
            # if "name already exists on this account" in str(e):
            # TODO: Handling this properly requires offline functionality for the git-repo, meaning we have to
            # create it without relying on Github and then link it if possible
    repo = org.create_repo(sheet_name,
                           description=f"Repository for {sheet_name}",
                           auto_init=True)
    # Git: clone with up to 5 retries (the fresh repo may not be ready yet).
    git_dir_path = fsm.get_clean_sheet_git_directory(sheet_name)
    clone = None
    tries = 0
    while clone==None and tries < 5:
        try:
            clone = pygit2.clone_repository(repo.clone_url, str(git_dir_path))
        except pygit2.GitError:
            print("Could not clone repo at:", repo.clone_url,
                  ", trying again in 1 second...")
            # Keep the AMQP connection alive while waiting.
            connection.process_data_events()
            tries += 1
            time.sleep(1)
    status = "complete"
    if clone != None:
        clone.remotes.set_url("origin", repo.clone_url)
        # Add the PDF to the main branch.
        pdf_path = fsm.get_sheet_whole_directory(sheet_name) / (sheet_name + ".pdf")
        shutil.copy(str(pdf_path), str(fsm.get_sheet_git_directory(sheet_name)))
        commit(clone, "Initialize main branch")
        # Push the initial commit, again with up to 5 retries.
        pushed = False
        push_tries = 0
        while not pushed and push_tries < 5:
            try:
                push(clone)
                pushed = True
            except pygit2.GitError:
                print(f"Could not push for score {sheet_name}, retrying in 1 second...")
                connection.process_data_events()
                push_tries += 1
                time.sleep(1)
        if pushed:
            # Add the MEI on the dedicated crowd-manager branch.
            clone.create_branch(cfg.github_branch, clone.head.peel())
            branch = clone.lookup_branch(cfg.github_branch)
            ref = clone.lookup_reference(branch.name)
            clone.checkout(ref)
            mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
            shutil.copy(str(mei_path),
                        str(fsm.get_sheet_git_directory(sheet_name)))
            commit(clone, "Initialize crowd manager branch",
                   branch=cfg.github_branch)
            # Push the branch (force), with up to 5 retries.
            pushed_branch = False
            branch_push_tries = 0
            while not pushed_branch and branch_push_tries < 5:
                try:
                    push(clone, branch=cfg.github_branch, force=True)
                    pushed_branch = True
                except pygit2.GitError:
                    print(f"Could not push for score {sheet_name}, retrying in 1 second...")
                    connection.process_data_events()
                    branch_push_tries += 1
                    time.sleep(1)
            if pushed and pushed_branch:
                # Protect the newly created/pushed branch and the main branch on Github
                repo.get_branch("main").edit_protection(
                    user_push_restrictions=[cfg.github_user])
                repo.get_branch(cfg.github_branch).edit_protection(
                    user_push_restrictions=[cfg.github_user])
            if not pushed_branch:
                print("Warning, could not push crowd manager's branch for",
                      sheet_name)
                status = "failed"
            del branch
            del ref
        else:
            print("Warning, could not push initial commit for", sheet_name)
            status = "failed"
        # Clean up (needed since pygit2 tends to leave files in .git open)
        del clone
        gc.collect()
    else:
        print("Warning, could not initialize repo for", sheet_name)
        status = "failed"
    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'github_init',
        'status': status,
        'name': sheet_name
    }
    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))
def callback(ch, method, properties, body):
    """Create and initialize a GitHub repository for a sheet (no retries).

    Expects a JSON message with 'name': the sheet name. Optionally deletes a
    pre-existing repo, creates a fresh one, clones it, pushes the sheet PDF
    to main, creates the crowd-manager branch with the MEI, protects both
    branches, and publishes 'complete' on the OMR planner status queue.

    NOTE(review): unlike the retrying variant, any GitHub/git failure here
    propagates as an exception — no 'failed' status is reported.
    """
    data = json.loads(body)
    sheet_name = data['name']
    # Get sheet id
    client = MongoClient(cfg.mongodb_address.ip, cfg.mongodb_address.port)
    db = client[cfg.db_name]
    sheet_id = str(db[cfg.col_sheet].find_one({"name" : sheet_name})["_id"])
    # Github
    github = Github(cfg.github_token)
    org = github.get_organization(cfg.github_organization)
    if cfg.delete_if_exists:
        try:
            org.get_repo(sheet_name).delete()
        except GithubException as e:
            # Deletion failing just means there was nothing to delete.
            print("Repo doesn't exist, ready for creation!")
            print(str(e))
            # if "name already exists on this account" in str(e):
            # TODO: Handling this properly requires offline functionality for the git-repo, meaning we have to
            # create it without relying on Github and then link it if possible
    repo = org.create_repo(sheet_name,
                           description=f"Repository for {sheet_name}",
                           auto_init=True)
    # Git
    git_dir_path = fsm.get_clean_sheet_git_directory(sheet_name)
    clone = pygit2.clone_repository(repo.clone_url, str(git_dir_path))
    clone.remotes.set_url("origin", repo.clone_url)
    # Add the PDF to the main branch.
    pdf_path = fsm.get_sheet_whole_directory(sheet_name) / (sheet_name + ".pdf")
    shutil.copy(str(pdf_path), str(fsm.get_sheet_git_directory(sheet_name)))
    commit(clone, "Initialize main branch")
    push(clone)
    # Add the MEI on the dedicated crowd-manager branch.
    clone.create_branch(cfg.github_branch, clone.head.peel())
    branch = clone.lookup_branch(cfg.github_branch)
    ref = clone.lookup_reference(branch.name)
    clone.checkout(ref)
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    shutil.copy(str(mei_path), str(fsm.get_sheet_git_directory(sheet_name)))
    commit(clone, "Initialize crowd manager branch", branch=cfg.github_branch)
    push(clone, branch=cfg.github_branch)
    # Protect the newly created/pushed branch and the main branch on Github
    repo.get_branch("main").edit_protection(
        user_push_restrictions=[cfg.github_user])
    repo.get_branch(cfg.github_branch).edit_protection(
        user_push_restrictions=[cfg.github_user])
    # Clean up (needed since pygit2 tends to leave files in .git open)
    del clone
    del branch
    del ref
    gc.collect()
    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'github_init',
        'status': 'complete',
        'name': sheet_name
    }
    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))