def get_description(soup, feed):
    """Pick a description for the page behind ``soup``.

    The ``'description'`` property is looked up with ``get_og_property``,
    ``get_twitter_property`` and ``get_meta_property`` in that order; the
    first truthy result wins. If all three come back empty, the feed's own
    ``description`` is returned after going through ``clean_html``.
    """
    # ``or`` short-circuits, so later lookups only run when earlier ones
    # produced nothing.
    return (
        get_og_property(soup, 'description')
        or get_twitter_property(soup, 'description')
        or get_meta_property(soup, 'description')
        or clean_html(feed.description)
    )
def get_title(soup, feed):
    """Pick a title for the page behind ``soup``.

    The ``'title'`` property is looked up with ``get_og_property``,
    ``get_twitter_property`` and ``get_meta_property`` in that order; the
    first truthy result wins. If all three come back empty, the feed's own
    ``title`` is returned after going through ``clean_html``.
    """
    # ``or`` short-circuits, so later lookups only run when earlier ones
    # produced nothing.
    return (
        get_og_property(soup, 'title')
        or get_twitter_property(soup, 'title')
        or get_meta_property(soup, 'title')
        or clean_html(feed.title)
    )
def run_get_abstracts(self):
    """Stamp ``self.updated`` and, when no abstract is stored, try to fetch one.

    ``self.updated`` is always set to the current naive UTC time. If
    ``self.abstract`` is falsy, the page at ``http://doi.org/<self.id>`` is
    downloaded; when it contains ``</header>``, everything after the first
    ``" <p>"`` that follows the header is passed through ``clean_html`` and
    its first 1000 characters are stored in ``self.abstract``.

    Returns ``None`` in every case. A page without the expected markers
    leaves ``self.abstract`` untouched.
    """
    self.updated = datetime.datetime.utcnow()

    # An abstract is already present: nothing to fetch.
    if self.abstract:
        return

    response = requests.get("http://doi.org/{}".format(self.id))
    page = response.text
    if "</header>" not in page:
        return

    try:
        after_header = page.split("</header", 1)[1]
        after_paragraph = after_header.split(" <p>", 1)[1]
        cleaned = clean_html(after_paragraph)
        self.abstract = cleaned[0:1000]
    except IndexError:
        # One of the split markers was missing; keep the abstract empty.
        pass
def article(self):
    """Return a dictionary with the title and paragraphs of the article.

    ``self.raw_text`` is parsed with ``BeautifulSoup`` using
    ``self.html_parser``; the soup is stored on ``self.soup`` and
    ``self._article()`` is then invoked (presumably the hook that fills
    ``self._title`` and ``self._paragraphs`` -- confirm in the subclass).

    Returns:
        dict: ``{'title': <cleaned first title>, 'paragraphs': [<cleaned,
        non-empty paragraphs>]}``. ``'paragraphs'`` is always a real list.

    Raises:
        ArticleNotParsable: if no title or no paragraphs were found.
    """
    self.soup = BeautifulSoup(self.raw_text, self.html_parser)
    self._article()
    if len(self._title) == 0 or len(self._paragraphs) == 0:
        raise ArticleNotParsable()

    article = dict()
    article['title'] = clean_html(self._title[0])
    # Clean the html and drop blank paragraphs. Build a concrete list:
    # on Python 3, filter()/map() return lazy one-shot iterators, which
    # cannot be indexed, measured, re-iterated or JSON-serialised.
    cleaned_paragraphs = (clean_html(paragraph) for paragraph in self._paragraphs)
    article['paragraphs'] = [text for text in cleaned_paragraphs if text]
    return article
def clean_description(self):
    """Return ``cleaned_data['description']`` passed through ``util.clean_html``."""
    raw_description = self.cleaned_data['description']
    return util.clean_html(raw_description)
def getEntitiesAndEnrichSourcesSequential(sources, paramsDict):
    """Enrich every source dict in ``sources``, one source at a time.

    Missing options in ``paramsDict`` are filled in place with defaults
    (``addTitleClass``, ``addTopKTermsFlag``, ``derefSleep``, ``debugFlag``,
    ``cacheFlag``). For each source the ``link`` is dereferenced, and the
    page title, cleaned text, favicon, extraction time and labelled
    entities are written back onto the source dict. Sources whose HTML or
    cleaned text is empty are skipped after ``setSourceDictDetails`` has
    been applied to them.

    Returns the same ``sources`` mapping that was passed in.
    """
    print('\ngetEntities Sequential():')

    #check/set defaults - start
    optionDefaults = (
        ('addTitleClass', False),
        ('addTopKTermsFlag', 0),
        ('derefSleep', 0),
        ('debugFlag', False),
        ('cacheFlag', False),
    )
    for optionName, fallback in optionDefaults:
        paramsDict.setdefault(optionName, fallback)
    #check/set defaults - end

    for _, sourceDict in sources.items():
        # The cache is only consulted when debugging AND caching are both on.
        readFromCache = paramsDict['debugFlag'] and paramsDict['cacheFlag']
        html = (
            derefURICache(sourceDict['link'])
            if readFromCache
            else dereferenceURI(sourceDict['link'], paramsDict['derefSleep'])
        )

        #set defaults - start
        setSourceDictDetails(sourceDict)
        #set defaults - end

        if len(html) == 0:
            continue

        title = extractPageTitleFromHTML(html)
        text = clean_html(html)
        favicon = extractFavIconFromHTML(html, sourceDict['link'])

        if len(text) == 0:
            continue

        labelledTokens = getEntitiesFromText(text)

        if paramsDict['addTitleClass']:
            labelledTokens = labelledTokens + getTokenLabelsForText(title, 'TITLE')

        #add top addTopKTermsFlag terms - start
        topK = paramsDict['addTopKTermsFlag']
        if topK > 0:
            termCounts = getTopKTermsListFromText(text, topK)
            # Each kept term is followed by a single space, as before.
            joinedTerms = ''.join(
                termCount[0] + ' '
                for termCount in termCounts
                if len(termCount) != 0
            )
            labelledTokens = labelledTokens + getTokenLabelsForText(
                joinedTerms, 'TOP' + str(topK) + 'TERM')
        #add top addTopKTermsFlag terms - end

        # Entities were extracted from the unsanitized text; only the
        # stored copy is sanitized.
        text = sanitizeText(text)

        sourceDict['text'] = text
        sourceDict['title'] = title
        sourceDict['favicon'] = favicon
        sourceDict['extraction-time'] = datetime.now().isoformat()
        sourceDict['entities'] = addDetailsToEntities(labelledTokens)

    return sources
def getEntitiesAndEnrichSources(sources, paramsDict):
    """Enrich every source dict in ``sources``, labelling text with a worker pool.

    Missing options in ``paramsDict`` are filled in place with defaults
    (``addTitleClass``, ``addTopKTermsFlag``, ``derefSleep``,
    ``threadPoolCount``, ``debugFlag``, ``cacheFlag``). Each source's
    ``link`` is dereferenced and its title, sanitized text and favicon are
    stored on the source dict. The collected texts are then labelled in a
    ``Pool`` of ``threadPoolCount`` workers, using ``parallelNERNew`` when
    ``nlpIsServerOn(args.nlp_server_host)`` is truthy and ``parallelNER``
    otherwise. ``args`` is not a parameter; it is read from the enclosing
    module scope.

    Returns:
        ``(sources, nerVersion)`` on the normal path, where ``nerVersion``
        is ``'3.8.0'`` or ``'old'``.

        NOTE(review): when ``threadPoolCount`` is 0 (sequential fallback)
        and when the pool stage raises, only ``sources`` is returned.
        Callers must cope with both shapes; this is kept unchanged for
        backward compatibility.
    """
    #NOTE getEntitiesAndEnrichSourcesSequential DUPLICATES FUNCTIONALITY FOR SIMPLICITY

    print('\ngetEntities()')

    #check/set defaults - start
    paramsDict.setdefault('addTitleClass', False)
    paramsDict.setdefault('addTopKTermsFlag', 0)
    paramsDict.setdefault('derefSleep', 0)
    paramsDict.setdefault('threadPoolCount', 5)
    paramsDict.setdefault('debugFlag', False)
    paramsDict.setdefault('cacheFlag', False)
    #check/set defaults - end

    if (paramsDict['threadPoolCount'] == 0):
        return getEntitiesAndEnrichSourcesSequential(sources, paramsDict)

    print('\tthreadPoolCount:', paramsDict['threadPoolCount'])

    textColToLabel = []
    listOfEntities2dList = []
    total = len(sources)
    nerVersion = ''

    for count, (source, sourceDict) in enumerate(sources.items(), 1):

        # The cache is only consulted when debugging AND caching are both on.
        if (paramsDict['debugFlag'] and paramsDict['cacheFlag']):
            html = derefURICache(sourceDict['link'])
        else:
            html = dereferenceURI(sourceDict['link'], paramsDict['derefSleep'])

        #set defaults - start
        setSourceDictDetails(sourceDict)
        #set defaults - end

        print('\tsource:', source)
        print('\t', count, 'of', total)

        if (html == ''):
            continue

        title = extractPageTitleFromHTML(html)
        text = clean_html(html)
        text = sanitizeText(text)
        favicon = extractFavIconFromHTML(html, sourceDict['link'])

        print('\thtml.len:', len(html))
        print('\ttext.len:', len(text))
        print()

        if (text == ''):
            continue

        sourceDict['title'] = title
        sourceDict['text'] = text
        sourceDict['favicon'] = favicon

        textColToLabel.append({
            'textToLabel': text,
            'id': source,
            'published': sourceDict['published']
        })

    try:
        workers = Pool(paramsDict['threadPoolCount'])
        try:
            serverOn = nlpIsServerOn(args.nlp_server_host)
            if (serverOn):
                print('\tNER version: 3.8.0')
                listOfEntities2dList = workers.map(parallelNERNew, textColToLabel)
                nerVersion = '3.8.0'
            else:
                print('\tNER version: old')
                #use old ner version since new server was not able to be started
                listOfEntities2dList = workers.map(parallelNER, textColToLabel)
                nerVersion = 'old'
        finally:
            # Always release the pool, even when the server check or map()
            # fails; previously a failure left the workers running.
            workers.close()
            workers.join()
    except Exception:
        # Best effort: report the failure and hand back what was gathered.
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        localErrorHandler()
        return sources

    for entitiesDetailsDict in listOfEntities2dList:

        source = entitiesDetailsDict['id']
        enriched = sources[source]
        enriched['entities'] = entitiesDetailsDict['entities2dList']

        if (paramsDict['addTitleClass']):
            enriched['entities'] += getTokenLabelsForText(enriched['title'], 'TITLE')

        #add top addTopKTermsFlag terms - start
        topK = paramsDict['addTopKTermsFlag']
        if (topK > 0):
            topKTerms = getTopKTermsListFromText(enriched['text'], topK)
            # Each kept term is followed by a single space.
            allTerms = ''.join(
                termCountTup[0] + ' '
                for termCountTup in topKTerms
                if len(termCountTup) != 0
            )
            enriched['entities'] += getTokenLabelsForText(
                allTerms, 'TOP' + str(topK) + 'TERM')
        #add top addTopKTermsFlag terms - end

        enriched['extraction-time'] = datetime.now().isoformat()
        enriched['entities'] = addDetailsToEntities(enriched['entities'])

    return sources, nerVersion
def clean_content(self):
    """Return ``cleaned_data["content"]`` passed through ``util.clean_html``."""
    raw_content = self.cleaned_data["content"]
    return util.clean_html(raw_content)