Example #1
from newspaper import Article, Config, nlp  # assumed imports: all three names come from the newspaper library


def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().strip().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)
        
            with open('htm-out', 'a') as f:
                for sentence in sentences:
                    f.write(sentence + '\n')
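A minimal entry-point sketch for this snippet, assuming it is run as a standalone script; the optional argument names a plain-text file listing one saved HTML page per line (paths below are illustrative, not from the original project):

# Hypothetical wrapper for Example #1's main(); not part of the original snippet.
# htmlist format (one local HTML path per line):
#     pages/article-001.html
#     pages/article-002.html
if __name__ == '__main__':
    import sys
    main(sys.argv)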
Example #2
import os

from newspaper import Config  # assumed; IntiSource and build_categories are project-level helpers (not shown here)


def main(argv):
    sourcelist = []
    if len(argv) > 1:
        sourcefile = argv[1]
        try:
            with open(sourcefile, 'r') as f:
                sourcelist = f.read().strip().split('\n')
        except IOError:
            print("File does not exist")

    """
    Check for existence of memo cache
    If it doesn't exist, create memo cache and populate top sources file with the specified sources.txt file. If it is not specified return an error and terminate.
    If memo cache exists, if sources.txt is specified do a check against top sources and add any new ones. If no sources.txt is specified use top sources file.
     """
    firstrun = False
    memo_cache_path = os.path.join(os.path.dirname(__file__), '.memo_cache')
    if not os.path.exists(memo_cache_path):
        if len(sourcelist) > 0:
            firstrun = True
            os.makedirs(memo_cache_path)
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                for source in sourcelist:
                    f.write(source + '\n')
        else:
            print("You must specify an input file on the first run")
            print("An input file contains line-separated urls to the top-level domains you wish to crawl")
            raise SystemExit
    else:
        if len(sourcelist) > 0:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                for source in sourcelist:
                    f.write(source + '\n')

        else:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'r') as f:
                sourcelist = f.read().strip().split('\n')

    # this config applies to the entire crawling process
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = True
    config.fetch_images = False

    top_sources = [IntiSource(url=source, config=config) for source in sourcelist]

    if firstrun:
        build_categories(top_sources)
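A brief usage sketch for this snippet (the script name is hypothetical): the first run must be given a sources file so the memo cache can be created; later runs may omit it and reuse the cached top-sources list.

# Hypothetical entry point for Example #2's main(); not part of the original snippet.
#   First run (creates .memo_cache):             python crawl_id.py sources.txt
#   Later runs (reuse .memo_cache/.top_sources): python crawl_id.py
if __name__ == '__main__':
    import sys
    main(sys.argv)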
Example #3
import os

from newspaper import Config, news_pool  # assumed; IntiSource and article_parse are project-level helpers (not shown here)


def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(sourcefile, 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source, config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        dom_path = os.path.join(OUT_PATH, s.domain)
        if not os.path.exists(dom_path):
            os.makedirs(dom_path)

        # Build
        s.build()

        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source=4)
    news_pool.join()

    article_parse(sources)
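For context, a minimal self-contained sketch of the news_pool pattern from the newspaper library that the snippet above relies on; the URLs are placeholders and newspaper.build stands in for the project's IntiSource class, so this illustrates the idiom rather than the project's own code.

import newspaper
from newspaper import news_pool

# Placeholder sources; the real code builds IntiSource objects instead.
papers = [newspaper.build(url, language='id', memoize_articles=False)
          for url in ('https://example-news-a.test', 'https://example-news-b.test')]

# Queue every article download across a thread pool, then block until done.
news_pool.set(papers, threads_per_source=4)
news_pool.join()

# After join() each article's HTML has been downloaded and is ready to parse.
for paper in papers:
    for article in paper.articles:
        article.parse()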