Example #1
from django.db import models
from filter.models import BlackList
from django.db.models import Q

from utils.helpers import get_domain_from_url, clean_url

from scrapyd_api import ScrapydAPI

from datetime import timedelta
import logging
logger = logging.getLogger(__name__)
# connect scrapyd service
localhost = 'http://localhost:6800'
scrapyd = ScrapydAPI(localhost)


class Domains(models.Model):
    domain = models.TextField(max_length=200, unique=True)
    url = models.URLField()
    # parent, child, grandchild ...
    level = models.SmallIntegerField(default=0)
    src_domain = models.TextField(max_length=200, null=True, default='manual')
    # crawl info
    fullscan = models.BooleanField(null=True, default=False)
    status = models.TextField(default='created', max_length=10)
    infoscan = models.BooleanField(null=True, default=False)
    externalscan = models.BooleanField(null=True, default=False)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    def save(self, *args, **kwargs):
        # The body of this override is truncated in the original example;
        # a plain super() call at least keeps the model usable.
        super().save(*args, **kwargs)
Example #2
def run_spider():
    scrapyd = ScrapydAPI(env('SCRAPYD', default='http://0.0.0.0:6800'))
    scrapyd.schedule('default', spider='pubgshowcase')
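
The env() call above is not defined in the snippet; it is presumably django-environ's Env instance. A minimal sketch of that assumption:

import environ

# Assumption: django-environ. env('SCRAPYD', default=...) then reads SCRAPYD from the process environment.
env = environ.Env()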
Example #3
from django.shortcuts import render  # needed by landing() below
from django.views.generic import View  #class based views inherit from View
from django.views.generic.base import TemplateView  #needed for class-view templates
from scrapyd_api import ScrapydAPI
#from django.utils import URLUtil
from .models import *
from .forms import *
from scrapy.utils.project import get_project_settings
from dal import autocomplete
import time

from .sentiment_script_custom import clean_article
from .sentiment_script_custom import sentiment_analysis
# Create your views here.

# connect scrapyd service
scrapyd = ScrapydAPI('http://localhost:6800')  #how do I make api calls to here?
#in a shell, cd to stockCrawl/stockBot/stockBot and run `scrapyd` to launch the Twisted server on port 6800
#latest scrapyd-client: pip install git+https://github.com/scrapy/scrapyd-client
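# For reference, typical calls against the ScrapydAPI client above look like the
# hypothetical sketch below (project and spider names are placeholders):
#   scrapyd.list_projects()                            # e.g. ['default']
#   job_id = scrapyd.schedule('default', 'stockSpider', url='https://example.com')
#   scrapyd.job_status('default', job_id)              # 'pending', 'running' or 'finished'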


def landing(request):
    dow_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=DJI&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    sp500_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=INX&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    nasdaq_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=IXIC&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    test = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=AAPL&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min"

    form = searchStockForm()  # instantiate the search form
    context = {'form': form}

    return render(request, 'stockSite/landing.html', context)
Example #4
from .models import Bitcoin, Ethereum, Ethereumclassic, Bitcoincash, Litecoin, Eos, Neo, Bitshares
from django.core import serializers
import json, datetime
from django.http import Http404, HttpResponse
from django.shortcuts import render  # needed by test() below
from django.db.models import Max

from uuid import uuid4
from django.http import JsonResponse
from scrapyd_api import ScrapydAPI

import threading, time

scrapyd_ip = 'localhost'
scrapyd_port = 6800
scrapyd_url = f'http://{scrapyd_ip}:{scrapyd_port}'
scrapyd = ScrapydAPI(scrapyd_url)  # reuse the URL assembled above
print(scrapyd.list_projects())


def test(request):
    return render(request, '../templates/gallery.html')


# def schedule(project,spider):
#     url = scrapyd_url + f'/schedule.json'
#     params = {
#         "project":project,
#         "spider":spider
#     }
#     r = requests.post(url,data = params)
#     return r.json()
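
The commented-out helper above posts to Scrapyd's schedule.json endpoint directly; with the ScrapydAPI client created earlier, the equivalent call is roughly the following (project and spider names are placeholders):

job_id = scrapyd.schedule('someproject', 'somespider')  # returns the Scrapyd job id for the new task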
Example #5
def get_scrapyd(client):
    if not client.auth:
        return ScrapydAPI(scrapyd_url(client.ip, client.port))
    return ScrapydAPI(scrapyd_url(client.ip, client.port),
                      auth=(client.username, client.password))
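
get_scrapyd() relies on a scrapyd_url() helper that is not shown in the snippet; a minimal sketch of what it presumably does (hypothetical implementation):

def scrapyd_url(ip, port):
    # Assumed helper: build the base URL the Scrapyd daemon listens on, e.g. http://127.0.0.1:6800
    return 'http://%s:%s' % (ip, port)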
Example #6
from urllib.parse import urlparse
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.views.decorators.http import require_POST, require_http_methods
from rest_framework.decorators import api_view
from django.shortcuts import render
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from scrapyd_api import ScrapydAPI
# from .utils import URLUtil
from .models import ScrapyItem, Quote
from rest_framework.views import APIView
import os
# Create your views here.
# scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd = ScrapydAPI('http://0.0.0.0:' + str(os.environ.get("PORT", 6800)))


def is_valid_url(url):
    validate = URLValidator()
    try:
        validate(url)  # check if url format is valid
    except ValidationError:
        return False

    return True


@csrf_exempt
@api_view(['GET', 'POST'])
# @require_http_methods(['POST', 'GET']) # only get and post
def crawl(request):
    # Hypothetical stub: the decorated view body is truncated in the original
    # example; Example #8 below shows a full POST/GET crawl handler of this shape.
    ...
Example #7
from django.shortcuts import render
from .models import ScrapyModel
from .forms import NameForm
from django.http import JsonResponse
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.views.decorators.http import require_POST, require_http_methods
from scrapyd_api import ScrapydAPI
from .BERT_NER import Bert_NER
from .Review_Sentiment import Review_Sentiment
import os
import pandas as pd
import numpy as np


# The Scrapyd URL below was redacted ('*****') in the original example and the
# following decorator was fused onto the same line; the view it decorates is truncated.
scrapyd = ScrapydAPI('http://*****:*****')


@require_http_methods(['POST', 'GET'])
def crawl(request):  # hypothetical stub name; body truncated in the original
    ...
Example #8
def crawl(request):
    '''Start the news crawling process.'''
    # Crawling requests are only accepted via the POST method
    scrapyd = ScrapydAPI('http://localhost:6800')
    if request.method == 'POST':

        url = request.POST.get('url', None)  # get the submitted url
        website = request.POST.get('website', None)
        # Check that a url was actually provided
        if not url:
            return JsonResponse({'error': 'Missing args'})

        # Check whether the url is valid
        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        # Check whether the url is already stored
        # if linkURL.objects.filter(link=url).exists():
        # return JsonResponse({'error': 'URL already stored in the database'})
        # else:
        # print("Hello World")
        # d, created = linkURL.objects.get_or_create(link=url)
        # if created :
        # 	d.save()

        domain = urlparse(url).netloc  # parse the url and extract the domain
        unique_id = str(uuid4())  # create a unique ID.
        '''Custom settings for the scrape'''
        settings = {
            'unique_id':
            unique_id,  # unique ID 
            'USER_AGENT':
            'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        # Here we schedule a new crawling task from scrapyd.
        # Notice that settings is a special argument name,
        # but we can pass other arguments as well.
        # This returns an ID which belongs to this task;
        # we are going to use it to check the task's status.
        task = scrapyd.schedule("default",
                                website,
                                settings=settings,
                                url=url,
                                domain=domain)

        return JsonResponse({
            'task_id': task,
            'unique_id': unique_id,
            'status': 'started'
        })

    elif request.method == 'GET':
        '''Check the status of a crawl.'''
        # The task_id and unique_id were returned by the POST branch above and
        # kept on the client side; the client passes them back here so we can
        # check the status of the crawl and, once it is finished, respond with
        # the crawled data.
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        url = request.GET.get('url', None)

        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Here we check status of crawling that just started a few seconds ago.
        # If it is finished, we can query from database and get results
        # If it is not finished we can return active status
        # Possible results are -> pending, running, finished
        status = scrapyd.job_status("default", task_id)
        '''If the status is finished, stop checking.'''
        if status == 'finished':
            # d, created = linkURL.objects.get_or_create(link=url)
            # if created :
            # d.save()
            return JsonResponse({'data': url, 'status': 'finished'})
        else:
            return JsonResponse({'status': status})
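
A hedged sketch of how a client might drive the crawl view above; the /crawl/ URL, host and exact parameter names are assumptions based on the POST and GET branches:

import time

import requests

# Start a crawl (hypothetical endpoint; 'website' is the spider name passed to scrapyd.schedule)
start = requests.post('http://localhost:8000/crawl/',
                      data={'url': 'https://example.com/news',
                            'website': 'example_spider'}).json()

# Poll until Scrapyd reports the job as finished
while True:
    check = requests.get('http://localhost:8000/crawl/',
                         params={'task_id': start['task_id'],
                                 'unique_id': start['unique_id'],
                                 'url': 'https://example.com/news'}).json()
    if check.get('status') == 'finished':
        break
    time.sleep(2)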
Example #9
    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)
Example #10
from django.contrib import auth
from django.contrib.auth.decorators import user_passes_test
from django.http import *
from django.shortcuts import render
from scrapyd_api import ScrapydAPI
import requests
from evaluate.YGT import train, predict

SCRAPYD_URL = 'http://localhost:6800'
PROJECT_NAME = 'lianjia'
#PROJECT_NAME='default'
SPIDER_NAME = 'lianjia_spider'

scrapyd = ScrapydAPI(SCRAPYD_URL)

# Create your views here.


def index(request):
    return render(request, "myadmin/index.html")


def loginCheck(func):
    def wrapper(request, *args, **kwargs):
        is_login = request.session.get('IS_LOGIN', False)
        if is_login:
            ret = func(request, *args, **kwargs)
            return ret
        else:
            return JsonResponse({"status": "not login"})

    return wrapper
Example #11
def GetTwitterCrawlerStatus(request):
    print("Twitter Ajax Calling")
    _index = 0
    status = False
    global SpiderTwitterJOBID
    try:
        # global scrapyd
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderTwitterJOBID != 'SpiderTwitterKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME,
                                       SpiderTwitterJOBID)
            print("Twitter JOBID = " + SpiderTwitterJOBID)
            print("Twitter JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False
    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("DB Index = " + str(_index) + " and History key = " +
              str(_historyKey))
        result = TwitterHistory.objects.using('SentimentAppDB').filter(
            id__gt=_index, historykey=_historyKey).values()
        if len(list(result)) != 0:
            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                sTextBlob = res[0]['ScoreTextBlob']
                scoreTextBlob = "{" + sTextBlob + "}"
                dt = json.loads(scoreTextBlob)
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {
                        'value': "positive",
                        'score': str(dt['polarity'])
                    }
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {
                        'value': "negative",
                        'score': str(dt['polarity'])
                    }
                else:
                    textBlobResult = {
                        'value': "neutral",
                        'score': str(dt['polarity'])
                    }
                res[0]['ScoreTextBlob'] = textBlobResult

                sVader = res[0]['ScoreVader']
                scoreVader = "{" + sVader + "}"
                d = json.loads(scoreVader)
                if float(d['comp']) >= 0.3:
                    vaderResult = {
                        'value': "positive",
                        'score': str(d['comp'])
                    }
                elif float(d['comp']) <= -0.3:
                    vaderResult = {
                        'value': "negative",
                        'score': str(d['comp'])
                    }
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                sGoogleNLP = res[0]['ScoreGoogleNLP']
                scoreGoogleNLP = "{" + sGoogleNLP + "}"
                da = json.loads(scoreGoogleNLP)
                if float(da['score']) >= 0.3:
                    googleNLPResult = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                else:
                    googleNLPResult = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                scoreStanfordCoreNLP = "{" + sStanfordCoreNLP + "}"
                da = json.loads(scoreStanfordCoreNLP)
                if float(da['score']) < 2:
                    stanfordCoreNLP = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                else:
                    stanfordCoreNLP = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                sAzure = res[0]['ScoreAzure']
                scoreAzure = "{" + sAzure + "}"
                da = json.loads(scoreAzure)
                if float(da['score']) < 0.4:
                    azureResult = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                elif float(da['score']) > 0.6:
                    azureResult = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                else:
                    azureResult = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreAzure'] = azureResult

                sIBM = res[0]['ScoreIBMNLP']
                scoreIBM = "{" + sIBM + "}"
                da = json.loads(scoreIBM)
                res[0]['ScoreIBM'] = {
                    'value': str(da['sentiment']),
                    'score': str(da['score'])
                }

                resCrawl['created_at'] = resCrawl['created_at']

                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)
            data = {
                'value': item,
                'score': score,
                'status': status,
                'index': id
            }
        else:

            data = {
                'value': [],
                'score': [],
                'status': status,
                'index': _index
            }
    else:
        print("Job ended.Twitter Scraping done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}

    response.append(data)
    return JsonResponse(response, safe=False)
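
The repeated threshold blocks above (TextBlob, VADER and Google NLP all use the same ±0.3 cut-offs) follow one pattern; a hedged helper sketch with the same cut-offs, purely illustrative and not part of the original code:

def label_score(value, pos_cut=0.3, neg_cut=-0.3):
    # Map a numeric sentiment score to the {'value', 'score'} dict shape used above.
    value = float(value)
    if value >= pos_cut:
        return {'value': 'positive', 'score': str(value)}
    if value <= neg_cut:
        return {'value': 'negative', 'score': str(value)}
    return {'value': 'neutral', 'score': str(value)}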
Example #12
def GetWebCrawlerStatus(request):
    print("Ajax Calling - Retrieving web crawled dataset..")
    _index = 0
    status = False
    global SpiderWebCrawlerJOBID
    try:
        # global scrapyd
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderWebCrawlerJOBID != 'SpiderWebCrawlerKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME,
                                       SpiderWebCrawlerJOBID)
            print("Web Crawler JOBID = " + SpiderWebCrawlerJOBID)
            print("Web Crawler JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False

    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("Web Crawler DB Index = " + _index + " and HistoryKey = " +
              _historyKey)
        # result = WebCrawl.objects.using('SentimentAppDB').raw("SELECT * FROM [dbo].[CrawlResult] where [id] > " + str(_index) + " and [HistoryKey] = '" + _historyKey + "'")
        result = WebCrawl.objects.using('SentimentAppDB').filter(
            id__gt=_index, HistoryId=_historyKey).values()
        # if len(list(result)) != 0:
        #     for resCrawl in result:
        #         print(resCrawl.scoreid_id)
        #         res = list(Score.objects.using('SentimentAppDB').filter(id=resCrawl.scoreid_id).values())
        if len(list(result)) != 0:

            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                sTextBlob = res[0]['ScoreTextBlob']
                scoreTextBlob = "{" + sTextBlob + "}"
                dt = json.loads(scoreTextBlob)
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {
                        'value': "positive",
                        'score': str(dt['polarity'])
                    }
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {
                        'value': "negative",
                        'score': str(dt['polarity'])
                    }
                else:
                    textBlobResult = {
                        'value': "neutral",
                        'score': str(dt['polarity'])
                    }
                res[0]['ScoreTextBlob'] = textBlobResult

                sVader = res[0]['ScoreVader']
                scoreVader = "{" + sVader + "}"
                d = json.loads(scoreVader)
                if float(d['comp']) >= 0.3:
                    vaderResult = {
                        'value': "positive",
                        'score': str(d['comp'])
                    }
                elif float(d['comp']) <= -0.3:
                    vaderResult = {
                        'value': "negative",
                        'score': str(d['comp'])
                    }
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                sGoogleNLP = res[0]['ScoreGoogleNLP']
                scoreGoogleNLP = "{" + sGoogleNLP + "}"
                da = json.loads(scoreGoogleNLP)
                if float(da['score']) >= 0.3:
                    googleNLPResult = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                else:
                    googleNLPResult = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                scoreStanfordCoreNLP = "{" + sStanfordCoreNLP + "}"
                da = json.loads(scoreStanfordCoreNLP)
                if float(da['score']) < 2:
                    stanfordCoreNLP = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                else:
                    stanfordCoreNLP = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                sAzure = res[0]['ScoreAzure']
                scoreAzure = "{" + sAzure + "}"
                da = json.loads(scoreAzure)
                if float(da['score']) < 0.4:
                    azureResult = {
                        'value': "negative",
                        'score': str(da['score'])
                    }
                elif float(da['score']) > 0.6:
                    azureResult = {
                        'value': "positive",
                        'score': str(da['score'])
                    }
                else:
                    azureResult = {
                        'value': "neutral",
                        'score': str(da['score'])
                    }
                res[0]['ScoreAzure'] = azureResult

                resCrawl['entryTime'] = resCrawl['entryTime'].strftime(
                    "%b %d %Y %H:%M:%S")
                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)
                print("LAST Row ID = " + str(id))

            data = {
                'value': item,
                'score': score,
                'status': status,
                'index': id
            }
        else:
            data = {
                'value': [],
                'score': [],
                'status': status,
                'index': _index
            }
    else:
        print("Job ended. Crawling done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}

    response.append(data)
    return JsonResponse(response, safe=False)
Example #13
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)
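
With the client stored above, the rest of such a helper class would typically wrap further Scrapyd calls; a sketch only (the method name is made up, and the available spiders depend on the target project):

    def available_spiders(self):
        # Sketch: list the spiders deployed under self.project on the target Scrapyd daemon
        return self.scrapyd.list_spiders(self.project)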
Example #14
def addByUrl_submit(request):
    start_url = request.POST.get('imdb_url', '')
    scrapyd = ScrapydAPI('http://localhost:6800')
    jobID = scrapyd.schedule('imdbscrapper', 'movie', start_url=start_url)
    return render(request, 'tommymovies/urlAdd.html', {'jobID': jobID})

    # The lines below appear to come from a different snippet fused into this example:
    # a Scrapy spider close handler. Its signature is missing and is assumed to be:
    def closed(self, reason):
        self.conn.commit()

        # Close the db connection when done.
        self.conn.close()

        # Count the number of products to be updated left
        self.count_products_update()

        self.logger.info("We still have number of products to update left: %s",
                         self.items_left)

        # Force the spider to stop when we cancel the job
        if reason == 'finished':
            if self.items_left >= 1000 and int(self.limit) >= 499:
                if (self.test == 1) or (self.test == '1'):
                    scrapyd = ScrapydAPI('http://127.0.0.1:6800')
                    scrapyd.schedule('scraper1',
                                     'ScrapeUpdateProduct',
                                     source=self.source,
                                     test='1',
                                     limit=self.limit,
                                     cats=self.updateCategories,
                                     descrp=self.updateDescriptions,
                                     images=self.updateImages,
                                     group=self.group)
                else:
                    self.logger.info(
                        "Don't Reschedule because it's a test run")
            else:
                self.logger.info(
                    "Don't Reschedule because limit is smaller than 1001 or items left is lower than 1000")