Example no. 1
    def post(self):
        """Post urls for extracting content (note: do not save the result)"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')

        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls is empty'
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result

        # prepend a scheme to urls that are missing one
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        s_crawler = PageCrawler()
        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)
        result['pages'] = s_content_getter.process(urls)
        return result
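
A minimal client-side sketch of calling this handler with requests. Only the handler body is shown above, so the host and the '/extract' path are assumptions; the parameter names are taken from the handler.

import requests

# Hypothetical call to the extraction endpoint above; the host and the
# '/extract' path are assumptions, only the parameter names come from the
# handler.
params = {
    'urls': 'example.com, python.org',  # comma-separated, scheme optional
    'extractor': 'dragnet',             # one of list_extractor
}
response = requests.post('http://localhost:8888/extract', data=params)
print(response.json())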
Example no. 2
    def __init__(self, *args, **kwargs):
        super(MyTestCase, self).__init__(*args, **kwargs)
        self.main_url = "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        self.sub_urls = [
            "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        ]

        self.urls = self.sub_urls + [self.main_url]
        self.crawler = PageCrawler()
        self.extractor = DragnetPageExtractor()
        self.content_getter = ContentGetter(self.crawler, self.extractor)
        self.es_client = Elasticsearch()
Example no. 3
def train_model(urls):
    logger.info('Start train_model...')
    logger.info('Num of train urls: %s' % len(urls))
    result = {}
    # config
    tokenizer = GeneralTokenizer().tokenize
    min_ngram = 1
    max_ngram = 2

    # train
    mg_client = get_mg_client()
    storage = mg_client.web.page
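    # s_extractor is assumed to be defined earlier in the module, along with
    # model_loc_dir and model_name used below; they are not part of this excerpt.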
    content_getter_with_storage = ContentGetter(
        PageCrawlerWithStorage(storage), s_extractor)
    modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                 path.join(model_loc_dir, model_name),
                                 tokenizer, min_ngram, max_ngram)
    ok, msg = modeler.train()
    mg_client.close()

    if not ok:
        result['error'] = True
        result['message'] = msg
        return result

    result['message'] = 'The new model %s was trained successfully' % model_name
    result['model_name'] = model_name
    result['data'] = msg
    logger.info('End train_model...')
    return result
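
A hedged driver sketch for train_model: it assumes the module-level names referenced above (s_extractor, model_name, model_loc_dir, get_mg_client, logger) are already configured, and the urls are placeholders.

# Hypothetical driver for train_model; assumes the module-level
# dependencies referenced above are already configured.
train_urls = [
    'http://example.com/article-1',
    'http://example.com/product-1',
]
report = train_model(train_urls)
if report.get('error'):
    print('Training failed: %s' % report['message'])
else:
    print('Trained model: %s' % report['model_name'])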
Example no. 4
    def post(self):
        """Post test set urls and model name for evaluation"""
        result = {'error': False}
        model_name = request.values.get('model_name', '')
        urls = request.values.get('urls', '')
        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls is empty'
            return result
        list_model = get_list_model()
        if not model_name or model_name not in list_model:
            result['error'] = True
            result['message'] = 'Model name is invalid, please select one of the models below'
            result['models'] = list_model
            return result

        # prepend a scheme to urls that are missing one
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        unlabeled_data = check_unlabeled_data(urls)
        if unlabeled_data:
            result['error'] = True
            result['message'] = ('Please label all urls first, unlabeled data: %s'
                                 % ', '.join(unlabeled_data))
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result

        mg_client = get_mg_client()
        storage = mg_client.web.page
        s_crawler = PageCrawlerWithStorage(storage)
        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)
        s_classifier = PredictWebPageType(model_loc_dir,
                                          model_name,
                                          s_content_getter,
                                          evaluate_mode=True)
        if classifier.get_current_model() != model_name:
            s_classifier.web_page_type_classifier = None
        else:
            s_classifier.web_page_type_classifier = classifier.web_page_type_classifier
            s_classifier.labels = classifier.web_page_type_classifier.named_steps[
                'clf'].classes_

        evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
        result.update(evaluation.evaluate())
        result['model_name'] = model_name
        mg_client.close()
        return result
Example no. 5
def crawl_pages(input_file, output_file):
    logger.info('Start processing input %s...' % input_file)
    with open(input_file, 'r') as f:
        list_url = [
            re.sub(r'\n', '', u.strip())
            for u in random.sample(f.readlines(), 1000)
        ]

    page_crawler = PageCrawler()
    page_extractor = DragnetPageExtractor()
    content_getter = ContentGetter(page_crawler, page_extractor)
    result = content_getter.process(list_url)
    with open(output_file, 'w') as f:
        data = json.dumps(result).encode('utf-8', errors='ignore')
        f.write(data)

    logger.info('End processing input %s...' % input_file)
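
A short usage sketch for crawl_pages; the file names are placeholders, and the input file is assumed to contain at least 1000 urls, one per line, since the function samples 1000 of them.

# Hypothetical invocation of crawl_pages; file names are placeholders and
# the input file must hold >= 1000 urls (one per line) for random.sample.
if __name__ == '__main__':
    crawl_pages('data/urls.txt', 'data/crawled_pages.json')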
Example no. 6
def evaluate_model(urls):
    logger.info('Start evaluate_model...')
    logger.info('Num of test urls: %s' % len(urls))
    result = {'error': False}
    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
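    # s_extractor, model_loc_dir and model_name are assumed to be defined
    # earlier in the module; they are not part of this excerpt.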
    s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor)
    s_classifier = PredictWebPageType(model_loc_dir,
                                      model_name,
                                      s_content_getter,
                                      evaluate_mode=True)

    evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
    result.update(evaluation.evaluate())
    result['model_name'] = model_name
    mg_client.close()
    logger.info('End evaluate_model...')
    return result
Example no. 7
import pandas as pd

from parser.content_getter import ContentGetter
from parser.crawler import PageCrawler
from parser.extractor import GooseDragnetPageExtractor
from pprint import pprint

FIELD_KEYWORD = 'Keyword'
FIELD_URL = 'Landing Page'
FIELD_URL_PAGE_CONTENT = 'Landing Page Content'
FIELD_URL_CRAWL_STATUS = 'Crawl Status'
FIELD_URL_TYPE = 'Url Type'
URL_TYPE_WEB = 'Web'
URL_TYPE_NEWS = 'News'

crawler = PageCrawler()
extractor = GooseDragnetPageExtractor()
content_getter = ContentGetter(crawler=crawler, extractor=extractor)

if __name__ == '__main__':
    url_file = 'data/top_10_ranking_keywords.xlsx'
    df = pd.read_excel(url_file)
    urls = set()
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        urls.add(url)
        # if idx == 5:
        #     break

    url_page_contents = content_getter.process(urls)
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        crawled_page = url_page_contents.get(url)
Example no. 8
class MyTestCase(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(MyTestCase, self).__init__(*args, **kwargs)
        self.main_url = "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        self.sub_urls = [
            "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        ]

        self.urls = self.sub_urls + [self.main_url]
        self.crawler = PageCrawler()
        self.extractor = DragnetPageExtractor()
        self.content_getter = ContentGetter(self.crawler, self.extractor)
        self.es_client = Elasticsearch()

    def test_crawler(self):
        result = self.crawler.process(self.urls)
        pprint(result)

    def test_extractor(self):
        pprint(self.extractor.process(self.crawler.process(self.urls)))

    def test_all_text_extractor(self):
        self.extractor = AllTextPageExtractor()
        pprint(
            self.extractor.process(
                self.crawler.process([
                    'https://www.uncommongoods.com/gifts/personalized/personalized-gifts'
                ])))

    def test_all_text_extractor2(self):
        self.extractor = AllTextPageExtractor()
        pprint(
            self.extractor.process(
                self.crawler.process([
                    'https://vnexpress.net/tin-tuc/thoi-su/8-nguoi-chet-hon-tram-nghin-ngoi-nha-bi-toc-mai-do-bao-doksuri-3642317.html'
                ])))

    def test_get_text_from_url(self):
        urls = [
            'https://www.uncommongoods.com/gifts/personalized/personalized-gifts',
            'https://stackoverflow.com/questions/1521592/get-root-domain-of-link',
            'https://docs.python.org/2/library/urlparse.html'
        ]

        for url in urls:
            print get_text_from_url(url)

    def test_content_getter(self):
        result = self.content_getter.process(self.urls)
        pprint(result)

    def test_cosine_similarity(self):
        similarity = CosineSimilarity(self.content_getter, self.es_client)
        result = similarity.process(self.main_url, self.sub_urls)
        pprint(result)

    def _call_api(self, i):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        response = requests.post(
            'http://107.170.109.238:8888/similarity/check', data=params)
        print i

    def test_api(self):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        pool = Pool(4)
        pool.map(self._call_api, range(2000))

    def test_similarity_function(self):
        from similarity_checker import cosine_similarity, jaccard_similarity, fuzzy_similarity, simhash_similarity
        tokens_1 = 'This is a foo ba'.split()
        tokens_2 = 'This sentence is similar to a foo bar sentence'.split()
        pprint('jaccard: %s' % jaccard_similarity(tokens_1, tokens_2))
        pprint('cosine: %s' % cosine_similarity(tokens_1, tokens_2))
        pprint('fuzzy: %s' % fuzzy_similarity(tokens_1, tokens_2))
        pprint('simhash: %s' % simhash_similarity(tokens_1, tokens_2))

    def test_tokenizer(self):
        from similarity_checker import tokenize_and_normalize_content
        url = 'https://www.travelocity.com/Las-Vegas-Hotels-MGM-Grand-Hotel-Casino.h12628.Hotel-Information'
        page = self.content_getter.process([url])
        pprint(tokenize_and_normalize_content(page[url]['content']))

    def test_tokenize_and_normalize(self):
        from similarity_checker import tokenize_and_normalize_content
        text = 'what are you doing'
        pprint(
            tokenize_and_normalize_content(text,
                                           unit='character',
                                           min_ngram=1,
                                           max_ngram=3))
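
The excerpt above has no test runner attached; a conventional way to execute it, assuming the class is saved as a standalone test module with the imports it needs:

import unittest

# Run the test case above with the standard unittest runner; assumes the
# class lives in a module that can be executed directly.
if __name__ == '__main__':
    unittest.main()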
Example no. 9
    def post(self):
        """Post web pages to extract content"""
        result = {'error': False, 'pages': []}
        unit = request.values.get('unit', 'word')
        min_ngram = int(request.values.get('min_ngram', 1))
        max_ngram = int(request.values.get('max_ngram', 1))
        urls = request.values.get('urls', '')
        strip_chars = ' "\''
        urls = [
            u.strip(strip_chars) for u in urls.split(',')
            if u.strip(strip_chars)
        ]
        if not urls:
            result['error'] = 'urls must not be empty'

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result
        if extractor_name == 'selective':
            s_extractor.selector_type = request.values.get(
                'selector_type', list_extractor[0])
            selector = request.values.get('selector')
            if not selector or not selector.strip():
                result[
                    'error'] = "You must specify the 'selector' element when the 'extractor' is 'selective'"
                return result
            s_extractor.selector = selector.strip()

        user_agent = request.values.get('user_agent', user_agents[0])
        page_load_timeout = request.values.get('page_load_timeout',
                                               page_load_timeout_default)
        wait_after_last_request = request.values.get(
            'wait_after_last_request', wait_after_last_request_default)
        s_crawler = PageCrawler(
            user_agent=user_agent.strip(),
            page_load_timeout=page_load_timeout,
            wait_after_last_request=wait_after_last_request)

        cache = int(request.values.get('cache', 0))
        if cache != 0:
            expire_time = int(request.values.get('expire_time',
                                                 604800))  # Seconds = 7 days
            s_crawler.active_redis_cache(expire_time)

        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)

        if not result['error']:
            pages = result['pages']
            for url, page in s_content_getter.process(urls).items():
                page['tokens'] = tokenize_and_normalize_content(
                    page['content'],
                    unit=unit,
                    min_ngram=min_ngram,
                    max_ngram=max_ngram)
                pages.append((url, page))

        return jsonify(result)
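
A client-side sketch for the tokenising handler above; the host and the '/extract/tokens' path are assumptions, the parameter names match the handler.

import requests

# Hypothetical call to the endpoint above; host and path are assumptions,
# parameter names are taken from the handler.
params = {
    'urls': 'https://example.com, https://example.org',
    'extractor': 'all_text',
    'unit': 'word',
    'min_ngram': 1,
    'max_ngram': 2,
    'cache': 1,            # turn on the redis cache
    'expire_time': 3600,   # cache entries for one hour
}
response = requests.post('http://localhost:8888/extract/tokens', data=params)
print(response.json())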
Example no. 10
    def post(self):
        """Post web pages to check similarity percentage"""
        result = {'error': False, 'similarity': []}
        # get request params
        unit = request.values.get('unit', 'word')
        min_ngram = int(request.values.get('min_ngram', 1))
        max_ngram = int(request.values.get('max_ngram', 1))
        similarity_checker.unit = unit
        similarity_checker.min_ngram = min_ngram
        similarity_checker.max_ngram = max_ngram
        distance_metric = request.values.get('distance_metric', '')
        if not distance_metric:
            similarity_checker.similarity = cosine_similarity
        elif distance_metric not in distance_metrics:
            result['error'] = 'distance_metric must be in %s' % ', '.join(
                distance_metrics)
            return result

        elif distance_metric == 'jaccard':
            similarity_checker.similarity = jaccard_similarity

        elif distance_metric == 'cosine':
            similarity_checker.similarity = cosine_similarity

        elif distance_metric == 'fuzzy':
            similarity_checker.similarity = fuzzy_similarity

        elif distance_metric == 'simhash':
            similarity_checker.similarity = simhash_similarity

        url_1 = request.values.get('url_1', '')
        url_2 = request.values.get('url_2', '')
        url_3 = request.values.get('url_3', '')
        if not url_1:
            result['error'] = 'url_1 must not be blank'
            return result

        if not url_2:
            result['error'] = 'url_2 must not be blank'
            return result

        if not url_3:
            result['error'] = 'url_3 must not be blank'
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result
        url_1_selector = None
        url_2_selector = None
        url_3_selector = None
        if extractor_name == 'selective':
            s_extractor.selector_type = request.values.get(
                'selector_type', list_extractor[0])
            url_1_selector = request.values.get('url_1_selector')
            url_2_selector = request.values.get('url_2_selector')
            url_3_selector = request.values.get('url_3_selector')
            if not url_1_selector or not url_1_selector.strip():
                result['error'] = "You must specify the 'url_1_selector' element when the 'extractor' " \
                                  "is 'selective'"
                return result
            if not url_2_selector or not url_2_selector.strip():
                result[
                    'error'] = "You must specify the 'url_2_selector' element when the 'extractor' is 'selective'"
                return result
            if not url_3_selector or not url_3_selector.strip():
                result[
                    'error'] = "You must specify the 'url_3_selector' element when the 'extractor' is 'selective'"
                return result

        user_agent = request.values.get('user_agent', user_agents[0])
        page_load_timeout = request.values.get('page_load_timeout',
                                               page_load_timeout_default)
        wait_after_last_request = request.values.get(
            'wait_after_last_request', wait_after_last_request_default)
        s_content_getter = ContentGetter(crawler=PageCrawler(
            user_agent=user_agent.strip(),
            page_load_timeout=page_load_timeout,
            wait_after_last_request=wait_after_last_request),
                                         extractor=s_extractor)

        # check similarity
        if not result['error']:
            similarity_checker.content_getter = s_content_getter
            similarity_checker.url_1_selector = url_1_selector
            similarity_checker.url_2_selector = url_2_selector
            similarity_checker.url_3_selector = url_3_selector
            sims = similarity_checker.cross_process(url_1, url_2, url_3)
            if sims:
                result['similarity'] = sims

        return jsonify(result)
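
A client-side sketch for the three-url cross-similarity handler above; the host and the path are assumptions, the parameter names match the handler.

import requests

# Hypothetical call to the cross-similarity endpoint above; host and path
# are assumptions, parameter names are taken from the handler.
params = {
    'distance_metric': 'cosine',
    'url_1': 'https://example.com/page-a',
    'url_2': 'https://example.com/page-b',
    'url_3': 'https://example.com/page-c',
    'extractor': 'dragnet',
}
response = requests.post('http://localhost:8888/similarity/cross_check',
                         data=params)
print(response.json())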
Example no. 11
    def post(self):
        """Post web pages to check similarity percentage"""
        result = {'error': False, 'similarity': []}
        # get request params
        unit = request.values.get('unit', 'word')
        min_ngram = int(request.values.get('min_ngram', 1))
        max_ngram = int(request.values.get('max_ngram', 1))
        similarity_checker.unit = unit
        similarity_checker.min_ngram = min_ngram
        similarity_checker.max_ngram = max_ngram
        distance_metric = request.values.get('distance_metric', '')
        if not distance_metric:
            similarity_checker.similarity = cosine_similarity
        elif distance_metric not in distance_metrics:
            result['error'] = 'distance_metric must be in %s' % ', '.join(
                distance_metrics)
            return result

        elif distance_metric == 'jaccard':
            similarity_checker.similarity = jaccard_similarity

        elif distance_metric == 'cosine':
            similarity_checker.similarity = cosine_similarity

        elif distance_metric == 'fuzzy':
            similarity_checker.similarity = fuzzy_similarity

        elif distance_metric == 'simhash':
            similarity_checker.similarity = simhash_similarity

        main_url = request.values.get('main_url', '')
        sub_url_string = request.values.get('sub_urls', '')
        strip_chars = ' "\''
        sub_urls = [
            u.strip(strip_chars) for u in sub_url_string.split(',')
            if u.strip(strip_chars)
        ]
        if not main_url:
            result['error'] = 'main_url must not be blank'
            return result

        if not sub_urls:
            result['error'] = 'sub_urls must not be blank'
            return result

        # validate params type
        if type(sub_urls) is not list:
            result['error'] = 'sub_urls must be in array type'
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result
        main_page_selector = None
        sub_page_selector = None
        if extractor_name == 'selective':
            s_extractor.selector_type = request.values.get(
                'selector_type', list_extractor[0])
            main_page_selector = request.values.get('main_page_selector')
            sub_page_selector = request.values.get('sub_page_selector')
            if not main_page_selector or not main_page_selector.strip():
                result['error'] = "You must specify the 'main_page_selector' element when the 'extractor' " \
                                  "is 'selective'"
                return result
            if not sub_page_selector or not sub_page_selector.strip():
                result[
                    'error'] = "You must specify the 'sub_page_selector' element when the 'extractor' is 'selective'"
                return result

        user_agent = request.values.get('user_agent', user_agents[0])
        page_load_timeout = request.values.get('page_load_timeout',
                                               page_load_timeout_default)
        wait_after_last_request = request.values.get(
            'wait_after_last_request', wait_after_last_request_default)
        s_content_getter = ContentGetter(crawler=PageCrawler(
            user_agent=user_agent.strip(),
            page_load_timeout=page_load_timeout,
            wait_after_last_request=wait_after_last_request),
                                         extractor=s_extractor)

        # check similarity
        if not result['error']:
            similarity_checker.content_getter = s_content_getter
            if main_page_selector:
                similarity_checker.main_page_selector = main_page_selector.strip(
                )
                similarity_checker.sub_page_selector = sub_page_selector.strip(
                )
            sims = similarity_checker.process(main_url=main_url,
                                              sub_urls=sub_urls)
            if sims:
                result['similarity'] = sims
            else:
                result['error'] = 'Main page is empty'

        return jsonify(result)
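
A client-side sketch for the main_url/sub_urls similarity handler; the '/similarity/check' path is taken from the test case in Example no. 8, while the host is a placeholder.

import requests

# Call mirroring the one in Example no. 8; the '/similarity/check' path
# comes from that test, the host is a placeholder.
params = {
    'distance_metric': 'cosine',
    'main_url': 'https://example.com/main-page',
    'sub_urls': 'https://example.com/page-1, https://example.com/page-2',
    'extractor': 'dragnet',
}
response = requests.post('http://localhost:8888/similarity/check', data=params)
print(response.json())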
Example no. 12
from flask import request, jsonify
from flask_restplus import Api, Resource, fields

from app import app
from parser.content_getter import ContentGetter
from parser.crawler_cluster import PageCrawlerCluster as PageCrawler
from parser.extractor import DragnetPageExtractor, ReadabilityPageExtractor, GoosePageExtractor, \
    GooseDragnetPageExtractor, SelectivePageExtractor, AllTextPageExtractor
from similarity_checker import SimilarityChecker, jaccard_similarity, cosine_similarity, \
    fuzzy_similarity, simhash_similarity, tokenize_and_normalize_content

api = Api(app, doc='/doc/', version='1.0', title='Web pages similarity')

crawler = PageCrawler()
extractor = DragnetPageExtractor()
content_getter = ContentGetter(crawler=crawler, extractor=extractor)
similarity_checker = SimilarityChecker(content_getter=content_getter,
                                       similarity=cosine_similarity)

list_extractor = [
    'dragnet', 'goose', 'goose_dragnet', 'readability', 'selective', 'all_text'
]


def get_extractor(name):
    if name == 'dragnet':
        return DragnetPageExtractor()
    elif name == 'readability':
        return ReadabilityPageExtractor()
    elif name == 'goose':
        return GoosePageExtractor()
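
A minimal usage sketch for get_extractor together with the module-level objects defined above; the url is a placeholder, and the fallback relies on get_extractor returning a falsy value for names it does not handle, which is how the request handlers use it.

# Minimal usage sketch: look an extractor up by name and fall back to the
# default DragnetPageExtractor when the name is unknown (the handlers treat
# a falsy return value from get_extractor as "not supported").
name = 'goose'
selected_extractor = get_extractor(name) or DragnetPageExtractor()
getter = ContentGetter(crawler=crawler, extractor=selected_extractor)
pages = getter.process(['http://example.com'])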
Example no. 13
    def post(self):
        """Post web page urls to train new model"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')
        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls is empty'
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result

        model_name = request.values.get(
            'model_name',
            time.strftime(self.date_time_format) +
            '_page_type_classifier.model')
        if model_name in get_list_model():
            result['error'] = True
            result[
                'message'] = "The model name '%s' is duplicated, please select another model name." % model_name
            return result

        tokenizer_name = request.values.get('tokenizer', list_tokenizer[0])
        if not tokenizer_name:
            result['error'] = True
            result['message'] = 'Tokenizer is empty'
            return result

        tokenizer = get_tokenizer(tokenizer_name)
        if not tokenizer:
            result['error'] = True
            result['message'] = "Tokenizer name '%s' is not supported, please choose one of these tokenizer name: %s" \
                                % (tokenizer_name, ', '.join(list_tokenizer))
            return result

        min_ngram = request.values.get('min_ngram', '1')
        max_ngram = request.values.get('max_ngram', '2')

        try:
            min_ngram = int(min_ngram)
            max_ngram = int(max_ngram)
        except ValueError:
            result['error'] = True
            result['message'] = 'Max ngram and min ngram must be integer'
            return result

        # prepend a scheme to urls that are missing one
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        unlabeled_data = check_unlabeled_data(urls)
        if unlabeled_data:
            result['error'] = True
            result['message'] = ('Please label all urls first, unlabeled data: %s'
                                 % ', '.join(unlabeled_data))
            return result

        mg_client = get_mg_client()
        storage = mg_client.web.page
        content_getter_with_storage = ContentGetter(
            PageCrawlerWithStorage(storage), s_extractor)
        modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                     path.join(model_loc_dir, model_name),
                                     tokenizer, min_ngram, max_ngram)
        ok, msg = modeler.train()
        mg_client.close()
        if not ok:
            result['error'] = True
            result['message'] = msg
            return result

        result['message'] = 'The new model %s was trained successfully' % model_name
        result['model_name'] = model_name
        result['data'] = msg
        return result
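
A client-side sketch for the training handler above; the host, the path and the tokenizer name are assumptions, the other parameter names match the handler.

import requests

# Hypothetical call to the training endpoint above; host, path and the
# tokenizer name are assumptions, parameter names are taken from the handler.
params = {
    'urls': 'example.com/article-1, example.com/product-1',
    'extractor': 'dragnet',
    'tokenizer': 'general',          # placeholder; must be in list_tokenizer
    'min_ngram': 1,
    'max_ngram': 2,
    'model_name': 'my_page_type_classifier.model',
}
response = requests.post('http://localhost:8888/model/train', data=params)
print(response.json())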