def match(self, text):
    """Collect the XML-element matches found in ``text``.

    Two passes are made over ``text``: one with ``XML_PATTERN_RE`` and one
    with ``FUZZY_XML_PATTERN_RE``. A hit is skipped when it is recognised as
    a URL anchor or an e-mail anchor; every remaining hit is turned into a
    match object via ``create_match`` with the result of ``get_xml_pair``
    passed along as its children.

    Args:
        text: The string to scan.

    Returns:
        A set holding one ``create_match`` result per accepted hit.
    """
    # Each pass pairs an element pattern with the URL-anchor test applied to
    # its hits. Note the asymmetry, kept on purpose: the first pass looks for
    # a URL anchor anywhere in the hit (``search``), the fuzzy pass only at
    # the start of the hit (``match``).
    passes = (
        (XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.search),
        (FUZZY_XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.match),
    )

    found = set()
    for element_re, is_url_anchor in passes:
        for hit in element_re.finditer(text):
            fragment = hit.group(0)
            if is_url_anchor(fragment) or \
                    ANCHOR_EMAIL_PATTERN_RE.match(fragment):
                continue
            start = hit.start()
            nested = get_xml_pair(fragment, start, self.priority)
            span = (start, hit.end(), 'xml element', self.priority)
            found.add(create_match(span, nested))
    return found
def match(self, text):
    """Return the set of XML-element matches detected in ``text``.

    ``text`` is scanned first with ``XML_PATTERN_RE`` and then with
    ``FUZZY_XML_PATTERN_RE``. Hits that are recognised as URL or e-mail
    anchors are ignored. For each remaining hit, ``get_xml_pair`` is called
    with the matched text, its start offset and ``self.priority``; its
    result is handed to ``create_match`` as the children of the new match.

    Args:
        text: The string to scan.

    Returns:
        A set of ``create_match`` results, one per accepted hit.
    """
    collected = set()

    # (element pattern, URL-anchor predicate) per pass. The strict pass
    # rejects a hit containing a URL anchor anywhere (``search``); the fuzzy
    # pass rejects only hits that begin with one (``match``).
    scan_plan = [
        (XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.search),
        (FUZZY_XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.match),
    ]

    for pattern, url_test in scan_plan:
        for found in pattern.finditer(text):
            element_text = found.group(0)
            anchored = (url_test(element_text) or
                        ANCHOR_EMAIL_PATTERN_RE.match(element_text))
            if not anchored:
                begin = found.start()
                kids = get_xml_pair(element_text, begin, self.priority)
                collected.add(
                    create_match(
                        (begin, found.end(), 'xml element', self.priority),
                        kids))
    return collected
def is_xml_lines(lines):
    """Estimate whether a group of lines looks like XML.

    Blank lines (empty after stripping) are ignored. A non-blank line is
    counted as XML when it matches one of the strict element/comment
    patterns and contains neither a URL anchor nor an e-mail anchor. When
    more than one line is supplied, a line that fails that test is still
    counted if it looks like a piece of a multi-line construct: an opening
    or closing tag, a comment delimiter, or an attribute/value pair.

    Args:
        lines: Sequence of strings, one per line.

    Returns:
        A tuple ``(is_xml, confidence)`` where ``confidence`` is the share
        of non-blank lines counted as XML (``0.0`` when there are no
        non-blank lines) and ``is_xml`` is ``confidence >= THRESHOLD_XML``.
    """
    lines_size = len(lines)
    multi_line = lines_size > 1
    xml_lines = 0
    empty_lines = 0

    for line in lines:
        stripped = line.strip()
        if not stripped:
            empty_lines += 1
        elif (XML_STRICT_PATTERN1_RE.match(line) or
              XML_STRICT_PATTERN2_RE.match(line) or
              XML_STRICT_COMMENT_RE.match(line)) and not (
                  ANCHOR_URL_PATTERN_RE.search(line) or
                  ANCHOR_EMAIL_PATTERN_RE.search(line)):
            xml_lines += 1
        elif multi_line and (
                XML_STRICT_OPENING_RE.match(line) or
                XML_STRICT_CLOSING_RE.match(line) or
                # XML comments open with '<!--'. The historical '<--'
                # spelling is still accepted so nothing that used to be
                # counted is lost.
                stripped.startswith(('<!--', '<--')) or
                stripped.endswith('-->') or
                XML_ATTRIBUTE_VALUE_PAIR_STRICT_RE.search(line)):
            xml_lines += 1

    non_empty_lines = lines_size - empty_lines
    confidence = 0.0
    if non_empty_lines > 0:
        # float() keeps this a true division on interpreters where '/' on
        # two ints would truncate.
        confidence = float(xml_lines) / non_empty_lines

    # Log inputs that land at most 0.10 below the threshold (inclusive of
    # the threshold itself) to help with tuning.
    if -0.10 <= (confidence - THRESHOLD_XML) <= 0:
        logger.info('Almost reached XML threshold: %s', lines)

    return (confidence >= THRESHOLD_XML, confidence)