re_matcher = re.compile("^https?://.*ics.uci.edu") def get_links(html): links = [] soup = BeautifulSoup(html, "html.parser") for link in soup.findAll('a', attrs={'href': re_matcher}): links.append(link.get('href')) return links def hasdigit(token): return any(c.isdigit() for c in token) stopw = Stopwords() def check_token(token): return not stopw.is_stop(token) and not hasdigit( token) and len(token) > 1 and len(token) < 20 def add_token(token): pass nonalphanum = re.compile("[^0-9a-z']") def tokenize_text(intext):
from zope.component.testing import setUp from index import Index from parsers.english import EnglishParser from splitter import SplitterFactory from stopwords import Stopwords from zopyx.txng3.core.interfaces import IParser, IStopwords, IThesaurus from zopyx.txng3.core.lexicon import LexiconFactory from zopyx.txng3.core.storage import StorageWithTermFrequencyFactory from zopyx.txng3.core.thesaurus import GermanThesaurus # Setup environment setUp() provideUtility(SplitterFactory, IFactory, 'txng.splitters.default') provideUtility(EnglishParser(), IParser, 'txng.parsers.en') provideUtility(Stopwords(), IStopwords, 'txng.stopwords') provideUtility(LexiconFactory, IFactory, 'txng.lexicons.default') provideUtility(StorageWithTermFrequencyFactory, IFactory, 'txng.storages.default') provideUtility(GermanThesaurus, IThesaurus, 'txng.thesaurus.de') try: import readline histfile = os.path.expanduser('~/.pyhist') readline.read_history_file(histfile) atexit.register(readline.write_history_file, histfile) except: pass class Text: