def __md_parse(xml_doc, text):
    r""" Return the DOM from multiline markdown text.
        Headers (#..., ##..., ###...) are interpreted as node names
        and the following text as the content of these nodes.
        /!\ multiple nodes with the same name at the same level ???
    """
    lines = text.splitlines()
    context = xml_doc
    if LIB.sanitize(lines[0]) == "---":
        # Markdown with YAML meta-data block
        yaml_block = ""
        for i in range(1, len(lines)):
            if LIB.sanitize(lines[i]) == "---":
                break
            yaml_block += lines[i] + "\n"
        # safe_load: yaml.load() without an explicit Loader is deprecated and unsafe
        meta_dict = yaml.safe_load(yaml_block)
        XML.add_dict_as_xml(context, meta_dict)
        raw_content_text = ""
        for j in range(i + 1, len(lines)):
            raw_content_text += lines[j] + "\n"
        node_raw = ET.SubElement(context, "raw_content")
        node_raw.text = str(raw_content_text)
    else:
        # Structured Markdown
        cur_level = 0
        code_fence_on = False
        for i in range(len(lines)):
            line = lines[i]
            match_code_fence = re.match(r"^ *```", line)
            if match_code_fence:
                code_fence_on = not code_fence_on
            match_title = re.match(r"^ *(#+)(.+)", line)
            if match_title and not code_fence_on:
                new_level = len(match_title.group(1))
                node_txt = match_title.group(2)
                node_name = __extract_node_name(node_txt)
                attributes = __extract_attributes(node_txt)
                if new_level > cur_level + 1:
                    meta = XML.xpath_node(xml_doc, ".//_meta")
                    XML.add_error(meta, "__md_parse()", "fatal",
                                  "md_level_mismatch", "level mismatch", i, line)
                else:
                    # pop contexts back up to the parent of the new level
                    for lev in range(new_level, cur_level + 1):
                        context = context.getparent()
                    context = ET.SubElement(context, node_name, attributes)
                    cur_level = new_level
            else:
                context.text = (context.text if context.text else "") + line + "\n"
        node_raw = ET.SubElement(xml_doc, "raw_content")
        node_raw.text = text
    return xml_doc
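
# Usage sketch (illustrative only; assumes ET is lxml.etree, since
# context.getparent() above is lxml-specific and does not exist in the
# stdlib xml.etree.ElementTree, and assumes the LIB/XML helpers behave
# as used above):
#
#   xml_doc = ET.Element("document")
#   ET.SubElement(xml_doc, "_meta")
#   __md_parse(xml_doc, "# My Title @id=x1\nSome body text.\n")
#   # -> <document><_meta/><my_title id="x1">Some body text.
#   #    </my_title><raw_content>...</raw_content></document>
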
def doc(root_dir, name):
    """ Parse a Markdown or CSV document file and return DOM. """
    full_path = os.path.join(root_dir, name)
    input_file = codecs.open(full_path, mode="r", encoding="utf-8")
    dir_name, file_name = os.path.split(name)
    forget, ext = os.path.splitext(name)
    ext = ext[1:].lower()
    xml_doc = ET.Element('document', {
        "path": name,
        "dir": dir_name,
        "file": file_name,
        "ext": ext
    })
    meta = ET.SubElement(xml_doc, "_meta")
    if ext == LIB.CSV_EXT:
        try:
            dom = __csv_parse(xml_doc, input_file.read())
        except Exception:
            dom = __bin_parse(xml_doc, None)
    elif ext == LIB.MD_EXT:
        #try:
        dom = __md_parse(xml_doc, input_file.read())
        #except: dom = __bin_parse(xml_doc, None)
    else:
        # others parsed as binary
        dom = __bin_parse(xml_doc, None)
    input_file.close()
    # build @ref, @title, @icon, @cat attributes
    for root_node in XML.xpath_list(dom, "*[name][not(@is_template)]"):
        ref = __extract_ref(XML.xpath_plain(root_node, "name/text()"))
        if ref > "":
            title = LIB.sanitize(XML.xpath_plain(root_node, "title/text()"))
            icon = __extract_image(XML.xpath_plain(root_node, "text()"))
            cat = "CAT-" + root_node.tag
            root_node.set("ref", ref)
            root_node.set("title", title)
            root_node.set("icon", icon)
            root_node.set("cat", cat)
            # find all distinct ref patterns
            text = " ".join(root_node.xpath(".//text()"))
            ext_refs = set(re.findall(r'\[`([A-Z]{2,6}-[\w_]+)`\]', text))
            for ext_ref in ext_refs:
                if ext_ref != ref:
                    XML.add_external_ref(meta, ext_ref, ref)
        else:
            XML.add_error(meta, "doc()", "warning", "ref_error",
                          "document reference syntax error", 0, ref)
    return dom
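
# Usage sketch (hypothetical paths; assumes LIB.MD_EXT == "md" as
# suggested by the dispatch above):
#
#   dom = doc("/data/corpus", "notes/readme.md")
#   print(dom.get("path"), dom.get("ext"))   # -> notes/readme.md md
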
def __extract_node_name(txt):
    """ Return txt with only lowercase and any non-alphanumerical
        characters replaced by '_'. Stop at first '@'.
    """
    txt = re.sub(r"@.*$", "", txt)
    txt = LIB.sanitize(txt).lower()
    txt = re.sub(r"\W+", "_", txt)
    txt = "_" + txt if re.match(r"^\d", txt) else txt
    return txt
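
# Examples (assuming LIB.sanitize() trims surrounding whitespace):
#   __extract_node_name(" My Title @id=x1")  -> "my_title"
#   __extract_node_name(" 2nd Section")      -> "_2nd_section"
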
def __extract_attributes(txt):
    """ Return a dictionary of attributes from '@key=value' patterns in txt. """
    txt = re.sub(r"^[^@]*", "", txt).strip("@")
    groups = txt.split("@")
    groups = [LIB.sanitize(a).lower() for a in groups]
    key_vals = [(g.split("=") + [""])[0:2]
                for g in groups if __is_valid_name(g.split("=")[0])]
    attribs = dict(key_vals)
    return attribs
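
# Examples (assuming LIB.sanitize() trims surrounding whitespace and
# __is_valid_name() rejects the empty string):
#   __extract_attributes("My Title @id=x1 @lang=en") -> {"id": "x1", "lang": "en"}
#   __extract_attributes("No attributes here")       -> {}
# Note: values may not contain '@', and anything after a second '='
# in a group is dropped by the [0:2] slice.
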
tarballs = [corpusdir + f for f in os.listdir(corpusdir) if f[-7:] == '.tar.gz']
for t in tarballs:
    with tarfile.open(t, 'r:gz') as tar:
        tar.extractall(path=tmp)

#list relevant txt files
reg = re.compile('^[A-Z]$')
corpusfiles = [tmp + f for f in os.listdir(tmp) if reg.match(f)]

#store all sentences in one list (already sanitized and lowercased)
corpus = []
for f in corpusfiles:
    with open(f, 'r') as src_file:
        for line in src_file.readlines():
            corpus.append(sanitize(line.lower()))

#delete temp subdir
rmtree(tmp)

## RANDOM-PICK LINES AND NONCES ##
#prepare vars
srcrange = range(len(corpus))   #no. of lines in BNC
outrange = range(Corpora.size)  #no. of desired lines in output
nums = []
nonces = []
out = []

#load pos-tagger (to avoid selecting punctuation as nonces)
nlp = spacy.load('en')
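
# Sketch of the picking step implied by the comments above (names and
# details are assumptions, not the original implementation):
#
#   while len(out) < Corpora.size:
#       n = random.choice(srcrange)
#       if n in nums:
#           continue                                   # no duplicate lines
#       tokens = nlp(corpus[n])
#       candidates = [t for t in tokens if not t.is_punct]
#       if candidates:
#           nums.append(n)
#           nonces.append(random.choice(candidates).text)
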
#!/usr/bin/env python3
"""Takes a dump of the 1st sentences of Wikipedia pages; then random-picks
a set no. of nonce :: context pairs therefrom, purges invalid entries, and
formats them so that they can be parsed by vectorize.py.
"""
import random, spacy
from conf import Corpora
from lib import getline_inf, sanitize

## PREPARE SOURCE CORPUS ##
with open(Corpora.inf.srcdir + 'wiki_src.txt', 'r') as src_file:
    src = src_file.readlines()                 #store lines in list
src = [sanitize(line.lower()) for line in src] #sanitize and lowercase

## FILTER OUT INVALID ENTRIES ##
#lambda: lines coming from disambiguation wiki pages
disamb = lambda x: 'may refer to' in x or 'may stand for' in x \
    or 'can refer to' in x or 'can stand for' in x \
    or 'might refer to' in x or 'might stand for' in x \
    or 'also refer to' in x or 'also stand for' in x
#lambda: empty lines (nonce without definition)
empty = lambda x: len(x) <= 1 or '::' in x[-5:]
#filter out
src = [line for line in src if not (disamb(line) or empty(line))]

## RANDOM-PICK LINES THEN OUTPUT ##
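
# Sketch of what this step could look like (a minimal, assumed version;
# the original random-pick/output code is not shown here, and the output
# filename is hypothetical):
#
#   picks = random.sample(src, Corpora.size)
#   with open(Corpora.inf.srcdir + 'wiki_out.txt', 'w') as out_file:
#       for line in picks:
#           nonce, _, context = line.partition('::')
#           out_file.write(nonce.strip() + ' :: ' + context.strip() + '\n')
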