from django.db import models
from filter.models import BlackList
from django.db.models import Q
from utils.helpers import get_domain_from_url, clean_url
from scrapyd_api import ScrapydAPI
from datetime import timedelta
import logging

logger = logging.getLogger(__name__)

# connect to the scrapyd service
localhost = 'http://localhost:6800'
scrapyd = ScrapydAPI(localhost)


class Domains(models.Model):
    domain = models.TextField(max_length=200, unique=True)
    url = models.URLField()
    # parent, child, grandchild ...
    level = models.SmallIntegerField(default=0)
    src_domain = models.TextField(max_length=200, null=True, default='manual')
    # crawl info
    fullscan = models.BooleanField(null=True, default=False)
    status = models.TextField(default='created', max_length=10)
    infoscan = models.BooleanField(null=True, default=False)
    externalscan = models.BooleanField(null=True, default=False)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    def save(self, *args, **kwargs):
        # body truncated in the source; a plain passthrough is assumed here
        super().save(*args, **kwargs)
def run_spider():
    # SCRAPYD env var points at the scrapyd daemon; falls back to a local instance
    scrapyd = ScrapydAPI(env('SCRAPYD', default='http://0.0.0.0:6800'))
    scrapyd.schedule('default', spider='pubgshowcase')
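The env() helper is not defined in the snippet above; it most likely comes from django-environ, along these lines (an assumption, not shown in the original):

import environ

env = environ.Env()  # reads SCRAPYD from the process environment / .env file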
from django.shortcuts import render
from django.views.generic import View  # class-based views inherit from View
from django.views.generic.base import TemplateView  # needed for class-view templates
from scrapyd_api import ScrapydAPI
# from django.utils import URLUtil
from .models import *
from .forms import *
from scrapy.utils.project import get_project_settings
from dal import autocomplete
import time
from .sentiment_script_custom import clean_article
from .sentiment_script_custom import sentiment_analysis

# Create your views here.

# connect to the scrapyd service
scrapyd = ScrapydAPI('http://localhost:6800')
# To launch scrapyd (a Twisted service listening on port 6800), run `scrapyd`
# from stockCrawl/stockBot/stockBot in a shell.
# Latest scrapyd-client: pip install git+https://github.com/scrapy/scrapyd-client


def landing(request):
    dow_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=DJI&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    sp500_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=INX&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    nasdaq_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=IXIC&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min'
    test = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=AAPL&outputsize=compact&apikey=TJ86LY8QFCFMQ44Z&datatype=json&interval=15min"
    form = searchStockForm()  # instantiate the form for template rendering
    context = {'form': form}
    return render(request, 'stockSite/landing.html', context)
from .models import Bitcoin, Ethereum, Ethereumclassic, Bitcoincash, Litecoin, Eos, Neo, Bitshares
from django.core import serializers
import json, datetime
from django.http import Http404, HttpResponse
from django.db.models import Max
from uuid import uuid4
from django.http import JsonResponse
from django.shortcuts import render
from scrapyd_api import ScrapydAPI
import threading, time

scrapyd_ip = 'localhost'
scrapyd_port = 6800
scrapyd_url = f'http://{scrapyd_ip}:{scrapyd_port}'
scrapyd = ScrapydAPI(scrapyd_url)
print(scrapyd.list_projects())  # sanity check: list projects deployed on the daemon


def test(request):
    return render(request, '../templates/gallery.html')

# def schedule(project, spider):
#     url = scrapyd_url + '/schedule.json'
#     params = {
#         "project": project,
#         "spider": spider,
#     }
#     r = requests.post(url, data=params)
#     return r.json()
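For reference, python-scrapyd-api wraps that same schedule.json endpoint, so the commented-out raw-HTTP helper can be replaced by a single call (project and spider names below are placeholders):

job_id = scrapyd.schedule('myproject', 'myspider')  # returns the scrapyd job ID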
def get_scrapyd(client):
    # No credentials stored: connect anonymously
    if not client.auth:
        return ScrapydAPI(scrapyd_url(client.ip, client.port))
    # Otherwise pass HTTP basic auth from the client record
    return ScrapydAPI(scrapyd_url(client.ip, client.port),
                      auth=(client.username, client.password))
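The scrapyd_url helper is not shown in this snippet; a minimal sketch of what it presumably does:

# Hypothetical reconstruction; not part of the original snippet
def scrapyd_url(ip, port):
    return f'http://{ip}:{port}'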
from urllib.parse import urlparse
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.views.decorators.http import require_POST, require_http_methods
from rest_framework.decorators import api_view
from django.shortcuts import render
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from scrapyd_api import ScrapydAPI
# from .utils import URLUtil
from .models import ScrapyItem, Quote
from rest_framework.views import APIView
import os

# Create your views here.

# scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd = ScrapydAPI('http://0.0.0.0:' + str(os.environ.get("PORT", 6800)))


def is_valid_url(url):
    validate = URLValidator()
    try:
        validate(url)  # check if the url format is valid
    except ValidationError:
        return False
    return True


@csrf_exempt
@api_view(['GET', 'POST'])
# @require_http_methods(['POST', 'GET'])  # only GET and POST
# (the decorated view is truncated in the source)
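A quick illustration of the validator's behavior (illustrative only; URLValidator accepts absolute http(s) URLs):

assert is_valid_url('https://example.com') is True
assert is_valid_url('not a url') is False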
from django.shortcuts import render
from .models import ScrapyModel
from .forms import NameForm
from django.http import JsonResponse
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.views.decorators.http import require_POST, require_http_methods
from scrapyd_api import ScrapydAPI
from .BERT_NER import Bert_NER
from .Review_Sentiment import Review_Sentiment
import os
import pandas as pd
import numpy as np

# Scrapyd endpoint; the basic-auth credentials are redacted in the source
scrapyd = ScrapydAPI('http://*****:*****')


@require_http_methods(['POST', 'GET'])
# (the decorated view is truncated in the source)
def crawl(request):
    """Start a news-crawling job."""
    # Crawl requests are only accepted via POST
    scrapyd = ScrapydAPI('http://localhost:6800')
    if request.method == 'POST':
        url = request.POST.get('url', None)  # the url submitted by the client
        website = request.POST.get('website', None)

        # Make sure a url was actually supplied
        if not url:
            return JsonResponse({'error': 'Missing args'})
        # Make sure the url is valid
        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        # Check whether the url is already stored
        # if linkURL.objects.filter(link=url).exists():
        #     return JsonResponse({'error': 'URL is already stored in the database'})
        # else:
        #     d, created = linkURL.objects.get_or_create(link=url)
        #     if created:
        #         d.save()

        domain = urlparse(url).netloc  # parse the url and extract the domain
        unique_id = str(uuid4())  # create a unique ID

        # Custom settings for this scrape
        settings = {
            'unique_id': unique_id,  # unique ID
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        # Schedule a new crawling task on scrapyd. `settings` is a special
        # argument name, but arbitrary extra kwargs can be passed as well.
        # The returned job ID is used later to check the task's status.
        task = scrapyd.schedule("default", website, settings=settings,
                                url=url, domain=domain)

        return JsonResponse({
            'task_id': task,
            'unique_id': unique_id,
            'status': 'started'
        })

    elif request.method == 'GET':
        # Check the status of a crawl started by the POST branch above.
        # The client echoes back the task_id and unique_id it received,
        # and once crawling is completed we respond with the crawled data.
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        url = request.GET.get('url', None)
        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Possible job states: pending, running, finished
        status = scrapyd.job_status("default", task_id)

        # Once the status is 'finished', the client can stop polling
        if status == 'finished':
            # d, created = linkURL.objects.get_or_create(link=url)
            # if created:
            #     d.save()
            return JsonResponse({'data': url, 'status': 'finished'})
        else:
            return JsonResponse({'status': status})
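A minimal client-side polling sketch for the view above (the endpoint path, host, and spider name are assumptions, not from the source):

import time
import requests

# Kick off a crawl via the POST branch
start = requests.post('http://localhost:8000/crawl/',
                      data={'url': 'https://example.com/news',
                            'website': 'news_spider'}).json()

# Poll the GET branch until scrapyd reports the job finished
while True:
    check = requests.get('http://localhost:8000/crawl/',
                         params={'task_id': start['task_id'],
                                 'unique_id': start['unique_id']}).json()
    if check.get('status') == 'finished':
        break
    time.sleep(5)  # still pending/running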
def __init__(self, server_url):
    self.server_url = server_url
    self.scrapyd_api = ScrapydAPI(server_url)
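A self-contained sketch of how such a wrapper might be used (the class name ScrapydClient and the project/spider names are hypothetical; only __init__ comes from the source):

from scrapyd_api import ScrapydAPI

class ScrapydClient:
    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

client = ScrapydClient('http://localhost:6800')
job_id = client.scrapyd_api.schedule('myproject', 'myspider')  # placeholder names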
from django.contrib import auth
from django.contrib.auth.decorators import user_passes_test
from django.http import *
from django.shortcuts import render
from scrapyd_api import ScrapydAPI
import requests

from evaluate.YGT import train, predict

SCRAPYD_URL = 'http://localhost:6800'
PROJECT_NAME = 'lianjia'
# PROJECT_NAME = 'default'
SPIDER_NAME = 'lianjia_spider'
scrapyd = ScrapydAPI(SCRAPYD_URL)

# Create your views here.


def index(request):
    return render(request, "myadmin/index.html")


def loginCheck(func):
    def wrapper(request, *args, **kwargs):
        is_login = request.session.get('IS_LOGIN', False)
        if is_login:  # the source had `if True:`, which bypassed the login check
            ret = func(request, *args, **kwargs)
            return ret
        else:
            return JsonResponse({"status": "not login"})
    return wrapper
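The decorator would typically wrap a view like the following (a minimal sketch; schedule_view is hypothetical, while scrapyd, PROJECT_NAME and SPIDER_NAME come from the module above):

@loginCheck
def schedule_view(request):
    # Schedule the lianjia spider and hand the job ID back to the client
    job = scrapyd.schedule(PROJECT_NAME, SPIDER_NAME)
    return JsonResponse({'job': job})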
def GetTwitterCrawlerStatus(request):
    print("Twitter Ajax Calling")
    _index = 0
    status = False
    global SpiderTwitterJOBID
    try:
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderTwitterJOBID != 'SpiderTwitterKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME, SpiderTwitterJOBID)
            print("Twitter JOBID = " + SpiderTwitterJOBID)
            print("Twitter JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False

    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("DB Index = " + str(_index) + " and History key = " + str(_historyKey))
        result = TwitterHistory.objects.using('SentimentAppDB').filter(
            id__gt=_index, historykey=_historyKey).values()
        if len(list(result)) != 0:
            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                # TextBlob polarity: >= 0.3 positive, <= -0.3 negative, else neutral
                sTextBlob = res[0]['ScoreTextBlob']
                dt = json.loads("{" + sTextBlob + "}")
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {'value': "positive", 'score': str(dt['polarity'])}
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {'value': "negative", 'score': str(dt['polarity'])}
                else:
                    textBlobResult = {'value': "neutral", 'score': str(dt['polarity'])}
                res[0]['ScoreTextBlob'] = textBlobResult

                # VADER compound score, same +/-0.3 cut-offs
                sVader = res[0]['ScoreVader']
                d = json.loads("{" + sVader + "}")
                if float(d['comp']) >= 0.3:
                    vaderResult = {'value': "positive", 'score': str(d['comp'])}
                elif float(d['comp']) <= -0.3:
                    vaderResult = {'value': "negative", 'score': str(d['comp'])}
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                # Google NLP score, same +/-0.3 cut-offs
                sGoogleNLP = res[0]['ScoreGoogleNLP']
                da = json.loads("{" + sGoogleNLP + "}")
                if float(da['score']) >= 0.3:
                    googleNLPResult = {'value': "positive", 'score': str(da['score'])}
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {'value': "negative", 'score': str(da['score'])}
                else:
                    googleNLPResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                # Stanford CoreNLP: 0-4 scale with 2 as neutral
                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                da = json.loads("{" + sStanfordCoreNLP + "}")
                if float(da['score']) < 2:
                    stanfordCoreNLP = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {'value': "positive", 'score': str(da['score'])}
                else:
                    stanfordCoreNLP = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                # Azure: 0-1 scale, < 0.4 negative, > 0.6 positive
                sAzure = res[0]['ScoreAzure']
                da = json.loads("{" + sAzure + "}")
                if float(da['score']) < 0.4:
                    azureResult = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 0.6:
                    azureResult = {'value': "positive", 'score': str(da['score'])}
                else:
                    azureResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreAzure'] = azureResult

                # IBM NLP already carries its own sentiment label
                sIBM = res[0]['ScoreIBMNLP']
                da = json.loads("{" + sIBM + "}")
                res[0]['ScoreIBM'] = {'value': str(da['sentiment']), 'score': str(da['score'])}

                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)
            data = {'value': item, 'score': score, 'status': status, 'index': id}
        else:
            data = {'value': [], 'score': [], 'status': status, 'index': _index}
    else:
        print("Job ended. Twitter scraping done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}
    response.append(data)
    return JsonResponse(response, safe=False)
def GetWebCrawlerStatus(request):
    print("Ajax Calling - Retrieving web crawled dataset..")
    _index = 0
    status = False
    global SpiderWebCrawlerJOBID
    try:
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        if SpiderWebCrawlerJOBID != 'SpiderWebCrawlerKey':
            state = scrapyd.job_status(SCRAPYD_PROJECT_NAME, SpiderWebCrawlerJOBID)
            print("Web Crawler JOBID = " + SpiderWebCrawlerJOBID)
            print("Web Crawler JOB State = " + state)
            if state == RUNNING or state == PENDING:
                status = True
            else:
                status = False
    except ConnectionError:
        status = False

    response = []
    item = []
    score = []
    id = 0
    if status == True:
        _index = request.GET.get('index', None)
        _historyKey = request.GET.get('historyKey', None)
        print("Web Crawler DB Index = " + str(_index) + " and HistoryKey = " + str(_historyKey))
        result = WebCrawl.objects.using('SentimentAppDB').filter(
            id__gt=_index, HistoryId=_historyKey).values()
        if len(list(result)) != 0:
            for resCrawl in result:
                res = list(
                    Score.objects.using('SentimentAppDB').filter(
                        id=resCrawl['scoreid_id']).values())

                # Same sentiment bucketing as the Twitter view above
                sTextBlob = res[0]['ScoreTextBlob']
                dt = json.loads("{" + sTextBlob + "}")
                if float(dt['polarity']) >= 0.3:
                    textBlobResult = {'value': "positive", 'score': str(dt['polarity'])}
                elif float(dt['polarity']) <= -0.3:
                    textBlobResult = {'value': "negative", 'score': str(dt['polarity'])}
                else:
                    textBlobResult = {'value': "neutral", 'score': str(dt['polarity'])}
                res[0]['ScoreTextBlob'] = textBlobResult

                sVader = res[0]['ScoreVader']
                d = json.loads("{" + sVader + "}")
                if float(d['comp']) >= 0.3:
                    vaderResult = {'value': "positive", 'score': str(d['comp'])}
                elif float(d['comp']) <= -0.3:
                    vaderResult = {'value': "negative", 'score': str(d['comp'])}
                else:
                    vaderResult = {'value': "neutral", 'score': str(d['comp'])}
                res[0]['ScoreVader'] = vaderResult

                sGoogleNLP = res[0]['ScoreGoogleNLP']
                da = json.loads("{" + sGoogleNLP + "}")
                if float(da['score']) >= 0.3:
                    googleNLPResult = {'value': "positive", 'score': str(da['score'])}
                elif float(da['score']) <= -0.3:
                    googleNLPResult = {'value': "negative", 'score': str(da['score'])}
                else:
                    googleNLPResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreGoogleNLP'] = googleNLPResult

                sStanfordCoreNLP = res[0]['ScoreStanfordCoreNLP']
                da = json.loads("{" + sStanfordCoreNLP + "}")
                if float(da['score']) < 2:
                    stanfordCoreNLP = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 2:
                    stanfordCoreNLP = {'value': "positive", 'score': str(da['score'])}
                else:
                    stanfordCoreNLP = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreStanfordCoreNLP'] = stanfordCoreNLP

                sAzure = res[0]['ScoreAzure']
                da = json.loads("{" + sAzure + "}")
                if float(da['score']) < 0.4:
                    azureResult = {'value': "negative", 'score': str(da['score'])}
                elif float(da['score']) > 0.6:
                    azureResult = {'value': "positive", 'score': str(da['score'])}
                else:
                    azureResult = {'value': "neutral", 'score': str(da['score'])}
                res[0]['ScoreAzure'] = azureResult

                resCrawl['entryTime'] = resCrawl['entryTime'].strftime("%b %d %Y %H:%M:%S")
                id = resCrawl['id']
                item.append(resCrawl)
                score.append(res)
            print("LAST Row ID = " + str(id))
            data = {'value': item, 'score': score, 'status': status, 'index': id}
        else:
            data = {'value': [], 'score': [], 'status': status, 'index': _index}
    else:
        print("Job ended. Crawling done.")
        data = {'value': [], 'score': [], 'status': status, 'index': _index}
    response.append(data)
    return JsonResponse(response, safe=False)
def __init__(self):
    scrapyd_url = input('Enter the scrapyd address: ')
    project = input('Enter the project name: ')
    self.project = project
    self.scrapyd = ScrapydAPI(scrapyd_url)
def addByUrl_submit(request):
    start_url = request.POST.get('imdb_url', '')
    scrapyd = ScrapydAPI('http://localhost:6800')
    # schedule() returns the scrapyd job ID; start_url is forwarded to the spider
    jobID = scrapyd.schedule('imdbscrapper', 'movie', start_url=start_url)
    return render(request, 'tommymovies/urlAdd.html', {'jobID': jobID})
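The returned jobID can later be passed to job_status() to poll the crawl (an illustrative follow-up, not in the source):

state = scrapyd.job_status('imdbscrapper', jobID)  # '', 'pending', 'running' or 'finished'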
# Fragment from a spider's close handler; the enclosing method signature
# (presumably "def closed(self, reason):") is truncated in the source.
self.conn.commit()
# Close the db connection when done.
self.conn.close()

# Count the number of products still waiting for an update
self.count_products_update()
self.logger.info("We still have this number of products to update left: %s",
                 self.items_left)

# Only reschedule when the spider finished normally, not when the job was cancelled
if reason == 'finished':
    if self.items_left >= 1000 and int(self.limit) >= 499:
        if (self.test == 1) or (self.test == '1'):
            scrapyd = ScrapydAPI('http://127.0.0.1:6800')
            scrapyd.schedule('scraper1', 'ScrapeUpdateProduct',
                             source=self.source,
                             test='1',
                             limit=self.limit,
                             cats=self.updateCategories,
                             descrp=self.updateDescriptions,
                             images=self.updateImages,
                             group=self.group)
        else:
            self.logger.info("Don't reschedule because it's a test run")
    else:
        self.logger.info(
            "Don't reschedule because limit is smaller than 1001 or items is lower than 1000")