def walkSiblings(self, node):
    currentSibling = Parser.previousSibling(node)
    b = []
    while currentSibling is not None:
        b.append(currentSibling)
        currentSibling = Parser.previousSibling(currentSibling)
    return b

def getMetaLang(self, article):
    """\
    Extract content languages from metas
    """
    # we have a lang attribute in html
    meta_langs = []
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is not None:
        meta_langs += attr.replace(' ', '').lower().split(',')
    # look up for a Content-Language in meta
    # ((attribute, prefix) pairs; a plain dict would silently drop
    # one of the duplicate 'name' keys)
    attrs = [
        ('http-equiv', 'content-language'),
        ('name', 'lang'),
        ('name', 'og:lang'),
    ]
    head = article.doc.find('head')
    if head is not None:
        metas = Parser.getElementsByTag(head, tag='meta')
        for meta in metas:
            for attr, prefix in attrs:
                if meta.attrib.get(attr, '').lower().startswith(prefix):
                    langs = meta.attrib.get('content', None)
                    if langs is not None:
                        meta_langs += langs.replace(' ', '').lower().split(',')
            if 'lang' in meta.attrib:
                meta_langs += meta.attrib['lang'].replace(' ', '').lower().split(',')
    result = []
    for lang in meta_langs:
        lang = lang[:2]
        if re.search(RE_LANG, lang):
            result.append(lang)
    return result

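# A minimal, standalone sketch of the normalisation the method above applies to
# each candidate language string (the html lang attribute or a matching meta's
# content attribute). The input value is invented for illustration, and the
# final RE_LANG filter is assumed to accept bare two-letter codes.
def _normalize_meta_langs_example(langs="en-US, fr"):
    # strip spaces, lowercase, split on commas, then keep the two-letter prefix
    parts = langs.replace(' ', '').lower().split(',')   # ['en-us', 'fr']
    return [lang[:2] for lang in parts]                 # ['en', 'fr']
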
def removeNodesViaRegEx(self, doc, pattern):
    for selector in ["id", "class"]:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughtyList = doc.xpath(reg, namespaces={"re": self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
    return doc

def test_childNodesWithText(self):
    html = '<html><body>'
    html += '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
    html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
    html += '</body></html>'
    doc = Parser.fromstring(html)
    p = Parser.getElementsByTag(doc, tag='p')[0]

def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [
            {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
            {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
        ]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break

    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()

    return None

def getH1(self, article):
    """\
    Fetch the article H1 tag.
    Searching last h1 tag before main article text begins.
    """
    h1 = ''
    if article.topNode is not None:
        lastTag = ''
        for i in article.doc.cssselect('[rel=topnode], h1'):
            if i.tag != 'h1':
                break
            lastTag = i
        if lastTag != '':
            h1 = Parser.getText(lastTag)
        else:
            # H1 inside the main article
            for i in article.doc.cssselect('[rel=topnode], h1'):
                if i.tag == 'h1':
                    lastTag = i
                    h1 = Parser.getText(lastTag)
                    break
    else:
        # Get first H1 tag
        h1Elem = Parser.getElementsByTag(article.doc, tag='h1')
        # no h1 found
        if h1Elem is None or len(h1Elem) == 0:
            return h1
        h1 = Parser.getText(h1Elem[0])
    return h1

def postExtractionCleanup(self, targetNode):
    """\
    remove any divs that look like non-content,
    clusters of links, or paras with no gusto
    """
    if targetNode.text is not None:
        e = Parser.createElement(text=targetNode.text)
        targetNode.text = None
        targetNode.insert(0, e)
    node = self.addSiblings(targetNode)
    for e in node:
        if e.tag in ('h2', 'h3', 'h4'):
            continue
        if e.tag not in ('p', 'pre', 'font'):
            textLen, stopCount, isHighLink = self.getTextStats(e)
            if isHighLink \
                    or self.isTableTagAndNoParagraphsExist(e) \
                    or not self.isNodeScoreThreshholdMet(node, e):
                Parser.remove(e)
    for e in reversed(node):
        if e.tag not in ('h2', 'h3', 'h4'):
            break
        Parser.remove(e)
    return node

def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    adds any siblings that may have a decent score to this node
    """
    if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
        e0 = currentSibling
        if e0.tail:
            e0 = deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
        if potentialParagraphs is None:
            return None
        else:
            ps = []
            for firstParagraph in potentialParagraphs:
                text = Parser.getText(firstParagraph)
                if len(text) > 0:
                    wordStats = self.stopwordsCls(language=self.language).getStopWordCount(text)
                    paragraphScore = wordStats.getStopWordCount()
                    siblingBaseLineScore = float(.30)
                    highLinkDensity = self.isHighLinkDensity(firstParagraph)
                    score = float(baselineScoreForSiblingParagraphs * siblingBaseLineScore)
                    if score < paragraphScore and not highLinkDensity:
                        p = Parser.createElement(tag='p', text=text, tail=None)
                        ps.append(p)
            return ps

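# Hedged illustration of the sibling-paragraph threshold used above: a
# candidate <p> is kept only when its own stopword score beats 30% of the
# baseline computed for the main article (and it is not link-heavy). The
# numbers below are invented.
def _sibling_threshold_example(baselineScoreForSiblingParagraphs=20, paragraphScore=8):
    siblingBaseLineScore = 0.30
    score = baselineScoreForSiblingParagraphs * siblingBaseLineScore   # 6.0
    return paragraphScore > score   # True -> the sibling paragraph is appended
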
def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
    """\
    adds any siblings that may have a decent score to this node
    """
    if current_sibling.tag == 'p' and len(Parser.getText(current_sibling)) > 0:
        e0 = current_sibling
        if e0.tail:
            e0 = deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potential_paragraphs = Parser.getElementsByTag(current_sibling, tag='p')
        if potential_paragraphs is None:
            return None
        else:
            ps = []
            for first_paragraph in potential_paragraphs:
                text = Parser.getText(first_paragraph)
                if len(text) > 0:
                    word_stats = self.stopwords_class(language=self.language).get_stopword_count(text)
                    paragraph_score = word_stats.get_stopword_count()
                    sibling_baseline_score = float(.30)
                    high_link_density = self.is_highlink_density(first_paragraph)
                    score = float(baselinescore_siblings_para * sibling_baseline_score)
                    if score < paragraph_score and not high_link_density:
                        p = Parser.createElement(tag='p', text=text, tail=None)
                        ps.append(p)
            return ps

def get_siblings_score(self, top_node):
    """\
    we could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against
    the total text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    base = 100000
    paragraphs_number = 0
    paragraphs_score = 0
    nodes_to_check = Parser.getElementsByTag(top_node, tag='p')
    for node in nodes_to_check:
        text_node = Parser.getText(node)
        word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
        high_link_density = self.is_highlink_density(node)
        if word_stats.get_stopword_count() > 2 and not high_link_density:
            paragraphs_number += 1
            paragraphs_score += word_stats.get_stopword_count()
    if paragraphs_number > 0:
        base = paragraphs_score / paragraphs_number
    return base

def is_highlink_density(self, e):
    """\
    checks the density of links within a node,
    is there not much text and most of it contains linky shit?
    if so it's no good
    """
    links = Parser.getElementsByTag(e, tag='a')
    if links is None or len(links) == 0:
        return False
    text = Parser.getText(e)
    words = text.split(' ')
    words_number = float(len(words))
    sb = []
    for link in links:
        sb.append(Parser.getText(link))
    linkText = ''.join(sb)
    linkWords = linkText.split(' ')
    numberOfLinkWords = float(len(linkWords))
    numberOfLinks = float(len(links))
    linkDivisor = float(numberOfLinkWords / words_number)
    score = float(linkDivisor * numberOfLinks)
    if score >= 1.0:
        return True
    return False

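# A rough worked example of the link-density score computed above, assuming a
# node whose text is 10 words, 6 of which sit inside 2 <a> tags; the values are
# made up and this helper is not part of the extractor itself.
def _link_density_example():
    words_number = 10.0            # total words in the node's text
    number_of_link_words = 6.0     # words that appear inside <a> tags
    number_of_links = 2.0          # number of <a> tags
    link_divisor = number_of_link_words / words_number    # 0.6
    score = link_divisor * number_of_links                 # 1.2
    return score >= 1.0            # True -> the node is considered link-heavy
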
def getMetaLang(self, article):
    """\
    Extract content language from meta
    """
    # we have a lang attribute in html
    attr = Parser.getAttribute(article.doc, attr='lang')
    if attr is None:
        # look up for a Content-Language in meta
        items = [
            {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
            {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
        ]
        for item in items:
            meta = Parser.getElementsByTag(article.doc, **item)
            if meta:
                attr = Parser.getAttribute(meta[0], attr='content')
                break

    if attr:
        value = attr[:2]
        if re.search(RE_LANG, value):
            self.language = value.lower()
            return value.lower()

    return None

def test_tostring(self):
    html = '<html><body>'
    html += '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
    html += '</body></html>'
    doc = Parser.fromstring(html)
    result = Parser.nodeToString(doc)
    self.assertEqual(html, result)

def convertDivsToParagraphs(self, doc, domType):
    badDivs = 0
    elseDivs = 0
    convertedTextNodes = 0
    divs = Parser.getElementsByTag(doc, tag=domType)
    replaceNodesList = {}
    divIndex = 0
    errors = []
    goods = []
    regexps = []
    selectors = []
    tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']
    for div in divs:
        items = Parser.getElementsByTags(div, tags)
        if div is not None and len(items) == 0:
            self.replaceElementsWithPara(doc, div)
            badDivs += 1
        elif div is not None:
            replaceNodes = self.getReplacementNodes(doc, div)
            div.clear()
            for c, n in enumerate(replaceNodes):
                div.insert(c, n)
            elseDivs += 1
    return doc

def getH1(self, article):
    """\
    Fetch the article H1 tag.
    Searching last h1 tag before main article text begins.
    """
    h1 = ''
    if article.topNode is not None:
        lastTag = ''
        for i in article.doc.cssselect('[rel=topnode], h1'):
            if i.tag != 'h1':
                break
            lastTag = i
            h1 = Parser.getText(i)
        if lastTag == '':
            for i in article.doc.cssselect('[rel=topnode] h1'):
                h1 = Parser.getText(i)
                break
    else:
        # Get first H1 tag
        h1Elem = article.doc.find('.//h1')
        # no h1 found
        if h1Elem is None:
            return h1
        h1 = Parser.getText(h1Elem)
    return h1

def checkForOpenGraphTag(self):
    """\
    checks to see if we were able to find open graph tags on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='meta', attr='property', value='og:image')
    for item in meta:
        href = Parser.getAttribute(item, attr='content')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "opengraph"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None

def getBaselineScoreForSiblings(self, topNode):
    """\
    we could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against
    the total text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    base = 100000
    numberOfParagraphs = 0
    scoreOfParagraphs = 0
    nodesToCheck = Parser.getElementsByTag(topNode, tag='p')
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = self.stopwordsCls(language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            numberOfParagraphs += 1
            scoreOfParagraphs += wordStats.getStopWordCount()
    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs
    return base

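# The normalisation described in the docstring above, reduced to its
# arithmetic: with 10 qualifying paragraphs scoring 1000 stopwords in total,
# the per-paragraph base is 100. Purely illustrative numbers.
def _baseline_score_example(scoreOfParagraphs=1000, numberOfParagraphs=10):
    base = 100000                   # fallback when no paragraph qualifies
    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs   # 100
    return base
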
def remove_nodes_regex(self, doc, pattern):
    for selector in ['id', 'class']:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughty_list = doc.xpath(reg, namespaces={'re': self.regexp_namespace})
        for node in naughty_list:
            Parser.remove(node)
    return doc

def checkForLinkTag(self):
    """\
    checks to see if we were able to find open link_src on this page
    """
    node = self.article.rawDoc
    meta = Parser.getElementsByTag(node, tag='link', attr='rel', value='image_src')
    for item in meta:
        href = Parser.getAttribute(item, attr='href')
        if href:
            mainImage = Image()
            mainImage.imageSrc = href
            mainImage.imageExtractionType = "linktag"
            mainImage.confidenceScore = 100
            locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
            if locallyStoredImage:
                mainImage.bytes = locallyStoredImage.bytes
                mainImage.height = locallyStoredImage.height
                mainImage.width = locallyStoredImage.width
            return mainImage
    return None

def clean_em_tags(self, doc):
    ems = Parser.getElementsByTag(doc, tag='em')
    for node in ems:
        images = Parser.getElementsByTag(node, tag='img')
        if len(images) == 0:
            node.drop_tag()
    return doc

def getBaselineScoreForSiblings(self, topNode):
    """\
    we could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against
    the total text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    base = 100000
    numberOfParagraphs = 0
    scoreOfParagraphs = 0
    nodesToCheck = Parser.getElementsByTag(topNode, tag='p')
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords(language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            numberOfParagraphs += 1
            scoreOfParagraphs += wordStats.getStopWordCount()
    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs
    return base

def removeNodesViaRegEx(self, doc, pattern):
    for selector in ['id', 'class']:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughtyList = cache.xpath(reg, doc, namespaces={'re': self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
    return doc

def cleanEmTags(self, doc):
    ems = Parser.getElementsByTag(doc, tag='em')
    for node in ems:
        images = Parser.getElementsByTag(node, tag='img')
        if len(images) == 0:
            node.drop_tag()
    return doc

def isHighLinkDensity(self, e):
    """\
    checks the density of links within a node,
    is there not much text and most of it contains linky shit?
    if so it's no good
    """
    links = Parser.getElementsByTag(e, tag='a')
    if links is None or len(links) == 0:
        return False
    text = Parser.getText(e)
    words = text.split(' ')
    numberOfWords = float(len(words))
    sb = []
    for link in links:
        sb.append(Parser.getText(link))
    linkText = ''.join(sb)
    linkWords = linkText.split(' ')
    numberOfLinkWords = float(len(linkWords))
    numberOfLinks = float(len(links))
    linkDivisor = float(numberOfLinkWords / numberOfWords)
    score = float(linkDivisor * numberOfLinks)
    if score >= 1.0:
        return True
    return False

def cleanEmTags(self, doc):
    ems = Parser.getElementsByTag(doc, tag="em")
    for node in ems:
        images = Parser.getElementsByTag(node, tag="img")
        if len(images) == 0:
            node.drop_tag()
    return doc

def clean(self, article):
    docToClean = article.doc
    nodelist = self.getNodesToDelete(docToClean)
    for node in nodelist:
        Parser.remove(node)
    docToClean = self.removeListsWithLinks(docToClean)
    docToClean = self.convertDivsToParagraphs(docToClean, ('div', 'dl', 'article'))
    return docToClean

def walk_siblings(self, node):
    current_sibling = Parser.previousSibling(node)
    b = []
    while current_sibling is not None:
        b.append(current_sibling)
        previousSibling = Parser.previousSibling(current_sibling)
        current_sibling = None if previousSibling is None else previousSibling
    return b

def dropTags(self, doc, tags):
    for tag in tags:
        ems = Parser.getElementsByTag(doc, tag=tag)
        for node in ems:
            images = Parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                node.drop_tag()
    return doc

def walkSiblings(self, node):
    currentSibling = Parser.previousSibling(node)
    b = []
    while currentSibling is not None:
        b.append(currentSibling)
        previousSibling = Parser.previousSibling(currentSibling)
        currentSibling = None if previousSibling is None else previousSibling
    return b

def replaceTagsWithText(self):
    """\
    replace common tags with just text so we don't have any crazy
    formatting issues so replace <br>, <i>, <strong>, etc....
    with whatever text is inside them
    code : http://lxml.de/api/lxml.etree-module.html#strip_tags
    """
    Parser.stripTags(self.getTopNode(), 'b', 'strong', 'i', 'br')

def convertToText(self, article):
    text = Parser.getFormattedText(self.topNode)
    lines = text.split(u'\n')
    good_lines = []
    for line in lines:
        if re.search('[^ \xa0]', line):
            good_lines.append(line.strip())
    text = u'\n'.join(good_lines)
    Parser.adjustTopNode(article)
    return text

def removeNodesViaRegEx(self, doc, pattern):
    for selector in ['id', 'class']:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughtyList = cache.xpath(reg, doc, namespaces={'re': self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
    return doc

def isTableTagAndNoParagraphsExist(self, e):
    # short-circuited: the table/paragraph check below is currently disabled
    return False
    subParagraphs = Parser.getElementsByTag(e, tag='p')
    for p in subParagraphs:
        txt = Parser.getText(p)
        if len(txt) < 25:
            Parser.remove(p)
    if not Parser.hasChildTag(e, 'p') and e.tag != "td":
        return True
    return False

def is_table_and_no_para_exist(self, e):
    subParagraphs = Parser.getElementsByTag(e, tag='p')
    for p in subParagraphs:
        txt = Parser.getText(p)
        if len(txt) < 25:
            Parser.remove(p)
    subParagraphs2 = Parser.getElementsByTag(e, tag='p')
    if len(subParagraphs2) == 0 and e.tag != "td":
        return True
    return False

def isTableTagAndNoParagraphsExist(self, e):
    subParagraphs = Parser.getElementsByTag(e, tag='p')
    for p in subParagraphs:
        txt = Parser.getText(p)
        if len(txt) < 25:
            Parser.remove(p)
    subParagraphs2 = Parser.getElementsByTag(e, tag='p')
    if len(subParagraphs2) == 0 and e.tag != "td":
        return True
    return False

def test_striptags(self):
    html = '<html><body>'
    html += '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
    html += '</body></html>'
    expected = '<html><body>'
    expected += '<p>this is a test link and this is strong</p>'
    expected += '</body></html>'
    doc = Parser.fromstring(html)
    Parser.stripTags(doc, 'a', 'strong')
    result = Parser.nodeToString(doc)
    self.assertEqual(expected, result)

def removeNodesWithNegativeScores(self):
    """\
    if there are elements inside our top node that have a
    negative gravity score, let's give em the boot
    """
    # short-circuited: negative-score removal is currently disabled
    return
    gravityItems = self.topNode.cssselect("*[gravityScore]")
    for item in gravityItems:
        score = int(item.attrib.get('gravityScore', 0))
        if score < 1:
            Parser.remove(item)

def post_cleanup(self, targetNode):
    """\
    remove any divs that look like non-content,
    clusters of links, or paras with no gusto
    """
    node = self.add_siblings(targetNode)
    for e in node.getchildren():
        if e.tag != 'p':
            if self.is_highlink_density(e) \
                    or self.is_table_and_no_para_exist(e) \
                    or not self.is_nodescore_threshold_met(node, e):
                Parser.remove(e)
    return node

def test_getElementsByTags(self):
    html = '<html><body>'
    html += '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
    html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
    html += '</body></html>'
    doc = Parser.fromstring(html)
    elements = Parser.getElementsByTags(doc, ['p', 'a', 'strong'])
    self.assertEqual(len(elements), 5)

    # find children within the first p
    p = Parser.getElementsByTag(doc, tag='p')[0]
    elements = Parser.getElementsByTags(p, ['p', 'a', 'strong'])
    self.assertEqual(len(elements), 2)

def postExtractionCleanup(self, targetNode):
    """\
    remove any divs that look like non-content,
    clusters of links, or paras with no gusto
    """
    node = self.addSiblings(targetNode)
    for e in node.getchildren():
        if e.tag != 'p':
            if self.isHighLinkDensity(e) \
                    or self.isTableTagAndNoParagraphsExist(e) \
                    or not self.isNodeScoreThreshholdMet(node, e):
                Parser.remove(e)
    return node

def getDepthLevel(self, node, parentDepth, siblingDepth):
    MAX_PARENT_DEPTH = 2
    if parentDepth > MAX_PARENT_DEPTH:
        return None
    else:
        siblingNode = Parser.previousSibling(node)
        if siblingNode is not None:
            return DepthTraversal(siblingNode, parentDepth, siblingDepth + 1)
        elif node is not None:
            parent = Parser.getParent(node)
            if parent is not None:
                return DepthTraversal(parent, parentDepth + 1, 0)
        return None

def is_boostable(self, node):
    """\
    a lot of times the first paragraph might be the caption under an image
    so we'll want to make sure if we're going to boost a parent node that
    it should be connected to other paragraphs, at least for the first n
    paragraphs so we'll want to make sure that the next sibling is a
    paragraph and has at least some substantial weight to it
    """
    para = "p"
    steps_away = 0
    minimum_stopword_count = 5
    max_stepsaway_from_node = 3
    nodes = self.walk_siblings(node)
    for current_node in nodes:
        # p
        if current_node.tag == para:
            if steps_away >= max_stepsaway_from_node:
                return False
            paraText = Parser.getText(current_node)
            word_stats = self.stopwords_class(language=self.language).get_stopword_count(paraText)
            if word_stats.get_stopword_count() > minimum_stopword_count:
                return True
            steps_away += 1
    return False

def remove_drop_caps(self, doc):
    items = Parser.css_select(doc, "span[class~=dropcap], span[class~=drop_cap]")
    for item in items:
        item.drop_tag()
    return doc

def isOkToBoost(self, node):
    """\
    a lot of times the first paragraph might be the caption under an image
    so we'll want to make sure if we're going to boost a parent node that
    it should be connected to other paragraphs, at least for the first n
    paragraphs so we'll want to make sure that the next sibling is a
    paragraph and has at least some substantial weight to it
    """
    para = "p"
    stepsAway = 0
    minimumStopWordCount = 5
    maxStepsAwayFromNode = 3
    nodes = self.walkSiblings(node)
    for currentNode in nodes:
        # p
        if currentNode.tag == para:
            if stepsAway >= maxStepsAwayFromNode:
                return False
            paraText = Parser.getText(currentNode)
            wordStats = StopWords(language=self.language).getStopWordCount(paraText)
            if wordStats.getStopWordCount() > minimumStopWordCount:
                return True
            stepsAway += 1
    return False

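# Hedged sketch of the boost window enforced above: walking backwards through
# the preceding siblings, a <p> must appear within at most 3 paragraph steps
# and carry more than 5 stopwords for the node to be boostable. The stopword
# counts below stand in for preceding <p> siblings and are invented.
def _boost_window_example(stopword_counts=(2, 1, 7)):
    stepsAway = 0
    minimumStopWordCount = 5
    maxStepsAwayFromNode = 3
    for count in stopword_counts:
        if stepsAway >= maxStepsAwayFromNode:
            return False               # too far from the node to matter
        if count > minimumStopWordCount:
            return True                # a meaty paragraph close by -> boost
        stepsAway += 1
    return False
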
def convertToText(self):
    txts = []
    for node in list(self.getTopNode()):
        txt = Parser.getText(node)
        if txt:
            txt = HTMLParser().unescape(txt)
            txts.append(innerTrim(txt))
    return '\n\n'.join(txts)

def extract_tags(self, article):
    node = article.doc
    # node doesn't have children
    if len(list(node)) == 0:
        return NO_STRINGS
    elements = Parser.css_select(node, A_REL_TAG_SELECTOR)
    if elements is None:
        return NO_STRINGS
    tags = []
    for el in elements:
        tag = Parser.getText(el)
        if tag:
            tags.append(tag)
    return set(tags)

def checkForKnownElements(self):
    """\
    in here we check for known image containers from sites
    we've checked out like yahoo, techcrunch, etc... that have
    known places to look for good images.
    TODO: enable this to use a series of settings files so people
    can define what the image ids/classes are on specific sites
    """
    domain = self.getCleanDomain()
    if domain in self.customSiteMapping.keys():
        classes = self.customSiteMapping.get(domain).split('|')
        for classname in classes:
            KNOWN_IMG_DOM_NAMES.append(classname)

    knownImage = None
    for knownName in KNOWN_IMG_DOM_NAMES:
        known = Parser.getElementById(self.article.rawDoc, knownName)
        if not known:
            known = Parser.getElementsByTag(self.article.rawDoc, attr='class', value=knownName)
            if known:
                known = known[0]
        if known:
            mainImage = Parser.getElementsByTag(known, tag='img')
            if mainImage:
                knownImage = mainImage[0]

    if knownImage is not None:
        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width
        return mainImage

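# Hypothetical illustration of the customSiteMapping structure consumed above:
# a domain mapped to a '|'-separated string of id/class names that get appended
# to KNOWN_IMG_DOM_NAMES. The domain and names here are invented, not part of
# any shipped configuration.
_custom_site_mapping_example = {
    'example.com': 'article-lead-photo|main_image',
}
# 'example.com' would contribute ['article-lead-photo', 'main_image'] via
# _custom_site_mapping_example['example.com'].split('|')
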
def getNodesToCheck(self, doc):
    """\
    returns a list of nodes we want to search on like paragraphs and tables
    """
    nodesToCheck = []
    for tag in ['p', 'pre', 'td']:
        items = Parser.getElementsByTag(doc, tag=tag)
        nodesToCheck += items
    return nodesToCheck

def remove_negativescores_nodes(self):
    """\
    if there are elements inside our top node that have a
    negative gravity score, let's give em the boot
    """
    gravity_items = Parser.css_select(self.top_node, "*[gravityScore]")
    for item in gravity_items:
        score = int(item.attrib.get('gravityScore', 0))
        if score < 1:
            item.getparent().remove(item)

def removeScriptsAndStyles(self, doc):
    # remove scripts
    scripts = Parser.getElementsByTag(doc, tag='script')
    for item in scripts:
        Parser.remove(item)
    # remove styles
    styles = Parser.getElementsByTag(doc, tag='style')
    for item in styles:
        Parser.remove(item)
    # remove comments
    comments = Parser.getComments(doc)
    for item in comments:
        Parser.remove(item)
    return doc

def getTitle(self, article):
    """\
    Fetch the article title and analyze it
    """
    title = ''
    doc = article.doc
    titleElem = Parser.getElementsByTag(doc, tag='title')
    # no title found
    if titleElem is None or len(titleElem) == 0:
        return title

    # title elem found
    titleText = Parser.getText(titleElem[0])
    usedDelimeter = False

    # split title with |
    if '|' in titleText:
        titleText = self.doTitleSplits(titleText, PIPE_SPLITTER)
        usedDelimeter = True

    # split title with -
    if not usedDelimeter and '-' in titleText:
        titleText = self.doTitleSplits(titleText, DASH_SPLITTER)
        usedDelimeter = True

    # split title with »
    if not usedDelimeter and u'»' in titleText:
        titleText = self.doTitleSplits(titleText, ARROWS_SPLITTER)
        usedDelimeter = True

    # split title with :
    if not usedDelimeter and ':' in titleText:
        titleText = self.doTitleSplits(titleText, COLON_SPLITTER)
        usedDelimeter = True

    title = MOTLEY_REPLACEMENT.replaceAll(titleText)
    return title

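# A hedged illustration of the delimiter preference above: '|' is tried first,
# then '-', '»' and ':', and only one delimiter is ever applied per title. The
# helper below only shows which splitter would be selected for an invented
# title; doTitleSplits itself is not reimplemented here.
def _title_delimiter_example(titleText="Some Headline | Example News"):
    if '|' in titleText:
        return 'PIPE_SPLITTER'
    if '-' in titleText:
        return 'DASH_SPLITTER'
    if u'»' in titleText:
        return 'ARROWS_SPLITTER'
    if ':' in titleText:
        return 'COLON_SPLITTER'
    return None   # title left (almost) as-is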