def get_body(self, content):
    """Strip wiki markup/punctuation from *content* and return body tokens.

    Pipeline: drop <ref> citations, drop "i.e" abbreviations, replace dots
    and non-alphanumerics with spaces, collapse whitespace, then lower-case,
    tokenize, stem and remove stop words.

    :param content: raw article text (wiki markup).
    :return: list of processed body tokens (stemmed, stop words removed).
    """
    # Remove <ref>...</ref> citation markup (non-greedy across the tag pair).
    content = re.sub(r"<ref.*?</ref>", ' ', content)
    # BUG FIX: the dot must be escaped — the original pattern "i.e" used
    # '.' as a wildcard and deleted ANY "i?e" trigram ("ire", "ice", ...).
    content = re.sub(r"i\.e", '', content)
    content = re.sub(r"\.", ' ', content)
    # Keep only ASCII alphanumerics and spaces.
    content = re.sub(r'[^a-zA-Z0-9 ]', '', content)
    # Collapse runs of spaces introduced by the substitutions above.
    content = re.sub(r' +', ' ', content)
    return remove_stop_words(stem_tokens(tokenize(content.lower())))
def extract_external_links(self, content):
    """Collect tokens from external-link bullet lines of *content*.

    Lines of the form ``* [http://... description]`` are scanned; the URL
    parts are dropped and the remaining description words are tokenized,
    stemmed, stop-word-filtered and appended to
    ``self.article.token['external_links']``.

    :param content: raw article text, one statement per line.
    """
    for line in content.split("\n"):
        # External links appear as wiki bullet list items.
        if '* [' not in line or '*[' in line:
            pass
        if '* [' in line or '*[' in line:
            parts = line.split(' ')
            # BUG FIX: the original tested ``'http' not in temp`` — membership
            # in the LIST — which is almost always True, so URLs were never
            # filtered. Filter each word individually instead.
            words = [word for word in parts if 'http' not in word]
            try:
                joined = ' '.join(words).encode('utf-8')
                self.article.token['external_links'].extend(
                    remove_stop_words(stem_tokens(tokenize(joined))))
            except Exception:
                # Best-effort by design: skip lines whose description fails
                # to encode or tokenize rather than aborting the extraction.
                pass
def get_tokens(self, content, title):
    """Populate the article token dict: title, headings, references, body.

    :param content: raw article text, passed through to reference extraction.
    :param title: article title string.
    """
    token = self.article.token
    token['title'] = tokenize(title)
    token['headings'] = tokenize(self.get_headings())
    token['References'] = tokenize(self.get_references(content))
    # NOTE(review): body tokens are built from self.article.content rather
    # than the `content` argument used above — confirm this is intentional.
    token['text'] = self.get_body(self.article.content)
def processTitle(title):
    """Lower-case, tokenize and stem *title*; return the resulting tokens.

    BUG FIX: the original body referenced an undefined name ``data`` instead
    of the ``title`` parameter, raising NameError on every call.

    :param title: title string to process.
    :return: list of stemmed tokens.
    """
    # ``stemmer`` is presumably a module-level stemmer instance — confirm
    # against the rest of the file.
    return stem_tokens(tokenize(title.lower()), stemmer)