Example #1
def keywords_from_url():
    """
    URL : /keywords_from_url
    Extract keywords from the text content of a web page.
    Method : POST
    Form data :
        - url : the url to analyze [string, required]
        - hits : limit number of keywords returned [int, optional, 100 by default]
    Return a JSON dictionary : {"keywords": [list of keywords], "total": total number of keywords found}
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data :
        raise InvalidUsage('No url specified in POST data')

    # crawl url, detect main language and get main text from url
    url_data = url.crawl(data["url"])
    if not url_data :
        raise InvalidUsage('No content to analyze')
    text_content = url.extract_content(url_data.text)

    # analyze text and extract keywords
    keywords = language.keyword_mining(text_content)

    # limit the number of keywords
    total = len(keywords)
    hits = int(data.get("hits", 100))
    keywords = [kw for kw, score in keywords.most_common(hits)]
    return jsonify(keywords=keywords, total=total)
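For reference, a minimal client-side sketch of calling this endpoint, assuming the Flask app above is served locally on port 5000 and the function is routed at /keywords_from_url (host, port and the target URL are assumptions, not part of the example):

import requests

resp = requests.post(
    "http://localhost:5000/keywords_from_url",
    data={"url": "https://example.com", "hits": 20},  # "hits" is optional
)
print(resp.json())  # {"keywords": [...], "total": <int>}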
Example #2
def index_job(link):
    """
    Index a single page.
    """
    print("index page : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    process = CrawlerProcess({
        'USER_AGENT':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True
        }
    })
    process.crawl(crawler.SingleSpider,
                  start_urls=[
                      link,
                  ],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()  # block until finished
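index_job is written as a blocking task (process.start() does not return until the crawl finishes), which fits a Redis RQ worker. A hedged sketch of enqueuing it, assuming a local Redis instance and that the function lives in a module named tasks (the module and queue names are assumptions):

import redis
from rq import Queue

from tasks import index_job  # hypothetical module path

q = Queue("indexing", connection=redis.Redis())
q.enqueue(index_job, "https://example.com/some-page")

Since CrawlerProcess.start() runs the Twisted reactor and blocks until the crawl is done, it cannot be started a second time in the same process; Example #6 below works around this by running CrawlerRunner in a child process.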
Example #3
def explore_job(link):
    """
    Explore a website and index all urls (redis-rq process).
    """
    print("explore website at : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web",
                       doc_type='domain',
                       id=domain,
                       body={
                           "homepage": link,
                           "domain": domain,
                           "last_crawl": datetime.now()
                       })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': True,
            'scrapy.extensions.closespider.CloseSpider': True
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debugging
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[
                      link,
                  ],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()

    return 1
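The client.index(...) call above creates or overwrites a domain document keyed by the domain name. A minimal sketch of reading that document back, assuming client is an elasticsearch.Elasticsearch instance compatible with the doc_type-style calls used here (the connection defaults and the example domain are assumptions):

from elasticsearch import Elasticsearch

client = Elasticsearch()  # defaults to localhost:9200 (assumed)

doc = client.get(index="web", doc_type="domain", id="example.com")
print(doc["_source"])  # {"homepage": ..., "domain": ..., "last_crawl": ...}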
Example #4
def index():
    """
    URL : /index
    Index a new URL in search engine.
    Method : POST
    Form data :
        - url : the url to index [string, required]
    Return a success message.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage("URL is invalid or has no text inside")

    # get main language of page
    lang = url.detect_language(url_data.text)
    if lang not in languages:
        raise InvalidUsage('Language not supported')

    # extract title of url
    title = url.extract_title(url_data.text)

    # extract description of url
    description = url.extract_description(url_data.text)

    # extract main content of url
    body = url.extract_content(url_data.text, languages.get(lang))

    # index url and data
    res = client.index(index="web-%s" % lang,
                       doc_type='page',
                       id=data["url"],
                       body={
                           "title": title,
                           "description": description,
                           "body": body,
                           "url": data["url"]
                       })

    return "Success"
Example #5
def reference_job(link, email):
    """
    Request the referencing of a website.
    """
    print("referencing page %s with email %s"%(link,email))

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage":link,
        "domain":domain,
        "email":email
    })

    return 1
Example #6
def explore_job(link):
    """
    Explore a website and index all urls (redis-rq process).
    """
    logging.info("explore website at : %s" % link)

    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    def f(q):
        try:
            """
            __author__      : Bijin Benny
            __email__       : [email protected]
            __license__     : MIT
            __version__     : 1.0
            Modification    : The original code used CrawlerProcess class from
            scrapy library to crawl web pages. However, CrawlerProcess class could
            not run parallely in Redis tasks threads. CrawlerProcess was replaced by
            CrawlerRunner class that could run parallely in multiple Redis tasks
            """
            runner = CrawlerRunner({
                'USER_AGENT':
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
                'DOWNLOAD_TIMEOUT': 100,
                'DOWNLOAD_DELAY': 0.25,
                'ROBOTSTXT_OBEY': True,
                'HTTPCACHE_ENABLED': False,
                'REDIRECT_ENABLED': False,
                'SPIDER_MIDDLEWARES': {
                    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware':
                    True,
                    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':
                    True,
                    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware':
                    True,
                    'scrapy.extensions.closespider.CloseSpider': True
                },
                'CLOSESPIDER_PAGECOUNT': 500  # only for debugging
            })
            runner.crawl(crawler.Crawler,
                         allowed_domains=[urlparse(link).netloc],
                         start_urls=[
                             link,
                         ],
                         es_client=es,
                         redis_conn=redis_conn)
            d = runner.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Q()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result
    return 1
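This variant relies on several names that the snippet itself does not define. Judging from how they are used, they presumably correspond to the following imports (an assumption, since the original module header is not shown):

from multiprocessing import Process, Queue as Q  # Q() and Process(target=f, ...)
from twisted.internet import reactor             # reactor.run() / reactor.stop()
from scrapy.crawler import CrawlerRunner         # replaces CrawlerProcess

Running the reactor inside a short-lived child process and passing any exception back through the queue keeps the non-restartable Twisted reactor out of the long-lived worker, so repeated explore jobs do not hit the ReactorNotRestartable error that the modification note alludes to.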