from newspaper import Article, Config, nlp


def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    # Read the newline-separated list of saved HTML files
    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []
    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()
            # Clean the raw HTML and split the extracted text into sentences
            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)
            with open('htm-out', 'a') as f:
                for r in sentences:
                    f.write(r + '\n')
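# Hedged usage sketch: the cleaner above is presumably run as a standalone
# script with an optional path to the list of saved HTML files (defaulting
# to 'htmlist'). The script name below is illustrative only, e.g.:
#
#   python clean_html.py htmlist
if __name__ == '__main__':
    import sys
    main(sys.argv)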
import os

from newspaper import Config


def main(argv):
    sourcelist = []
    if len(argv) > 1:
        sourcefile = argv[1]
        try:
            with open(sourcefile, 'r') as f:
                sourcelist = f.read().strip().split('\n')
        except IOError:
            print("File does not exist")

    """
    Check for the existence of the memo cache.

    If it doesn't exist, create the memo cache and populate the top sources
    file from the specified sources.txt file; if no file is specified,
    print an error and terminate.

    If the memo cache exists and a sources.txt is specified, check it against
    the top sources and add any new ones. If no sources.txt is specified, use
    the existing top sources file.
    """
    firstrun = False
    memo_cache_path = os.path.join(os.path.dirname(__file__), '.memo_cache')
    if not os.path.exists(memo_cache_path):
        if len(sourcelist) > 0:
            firstrun = True
            os.makedirs(memo_cache_path)
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                for source in sourcelist:
                    f.write(source + '\n')
        else:
            print("You must specify an input file on the first run")
            print("An input file contains line-separated urls to the top-level domains you wish to crawl")
            raise SystemExit
    else:
        if len(sourcelist) > 0:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                for source in sourcelist:
                    f.write(source + '\n')
        else:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'r') as f:
                sourcelist = f.read().split('\n')

    # this config applies to the entire crawling process
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = True
    config.fetch_images = False

    top_sources = [IntiSource(url=source, config=config) for source in sourcelist]

    if firstrun:
        build_categories(top_sources)
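# A minimal sketch of the build_categories() helper called above; the real
# implementation lives elsewhere. It assumes IntiSource follows newspaper's
# Source interface (build() and category_urls()) and that discovered category
# URLs are cached in a '.categories' file (an assumed name) next to
# '.top_sources' in the memo cache.
def build_categories(top_sources):
    memo_cache_path = os.path.join(os.path.dirname(__file__), '.memo_cache')
    category_file = os.path.join(memo_cache_path, '.categories')  # assumed filename
    with open(category_file, 'w') as f:
        for source in top_sources:
            source.build()  # download the front page and discover category/feed urls
            for url in source.category_urls():
                f.write(url + '\n')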
import os

from newspaper import Config, news_pool


def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(sourcefile, 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source, config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)
        # Build
        s.build()
        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source=4)
    news_pool.join()

    article_parse(sources)
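# A minimal sketch of the article_parse() step invoked above; the real
# implementation is defined elsewhere. It assumes each IntiSource exposes
# newspaper's Source interface (.articles downloaded by news_pool and a
# .domain attribute) and writes sentences to a per-domain file under the
# output directory; the 'sentences.txt' filename is an assumption.
from newspaper import ArticleException, nlp


def article_parse(sources, out_path=os.path.join(os.path.dirname(__file__), 'output')):
    for source in sources:
        out_file = os.path.join(out_path, source.domain, 'sentences.txt')
        with open(out_file, 'a') as f:
            for article in source.articles:
                try:
                    article.parse()  # html was already fetched by news_pool
                except ArticleException:
                    continue  # skip articles that failed to download
                for sentence in nlp.split_sentences(article.text):
                    f.write(sentence + '\n')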