def getSimilarity():
    words1 = crawl(request.form["url1"])
    documents.append(words1)
    data1 = sorted(words1.items(), key=operator.itemgetter(1), reverse=True)
    words2 = crawl(request.form["url2"])
    documents.append(words2)
    data2 = sorted(words2.items(), key=operator.itemgetter(1), reverse=True)
    similarity = getCosineSimilarity(documents, words1, words2)
    # Merge counts for words that appear in both documents.
    commonData = []
    for d in data1:
        try:
            commonData.append((d[0], d[1] + data2[findIndex(d[0], data2)][1]))
        except ValueError:
            pass  # word not present in the second document
    commonData.sort(key=lambda word: word[1], reverse=True)
    dataResult = ""
    for d in commonData:
        dataResult += getDataTag(d)
    return render_template(
        "result.html",
        res=f"<Similarity> : {similarity}\n\n<Common Word : Word Num>\n\n{dataResult}")
def runCrawl(url_list, db_name='crawl-data', manual=True, append=False):
    '''
    Basic wrapper to OpenWPM crawl script.
    url_list is a list of URLs to crawl: ['https://abc.com', 'https://xyz.net'] etc.
    db_name is the name of the database to write crawl output to.
    manual can be set to False to skip the crawl data deletion warning.
    append can be set to True to append to existing crawl data instead of deleting it.
    '''
    if manual and not append:
        go = input("Proceeding will delete the old crawl data. Enter Y to continue.\n")
    else:
        go = 'y'
    if go.lower() == 'y':
        if not append:
            try:
                os.remove('datadir/{}.sqlite'.format(db_name))
            except FileNotFoundError:
                print("Old crawl data not found. Continuing.")
        crawl.crawl(url_list, db_name)
    else:
        print("Aborting.")
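# Hypothetical usage of the runCrawl wrapper above; the URL list and the
# database name are illustrative, and manual=False skips the interactive
# deletion prompt.
runCrawl(['https://example.com', 'https://example.org'],
         db_name='demo-crawl', manual=False)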
def daily(self):
    print("Running Daily Job")
    # the ingest function sorts and moves files by date into the working/media directory
    ingest.ingest(ingestdir, workingdir)
    # the crawl function performs a hash index of all files in the target directories
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    archivedirsum = crawl.crawl(False, archivedir, jsondatadir)
    # the dedupe function combines all hash indexes and analyzes the dataset for duplicates
    data_files = glob.glob(jsondatadir + '/*.json')
    # run the dedupe function
    dedupe.dedupe(data_files, duplicatedir)
def daemon_job(interval):
    """
    Main function of the crawler daemon.
    interval: time in seconds between each crawl
    """
    time.sleep(3)  # wait for the API server to start first
    while True:
        try:
            crawl()
            process_notification()
        except Exception:
            traceback.print_exc()
        time.sleep(interval)
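# A minimal sketch of launching daemon_job alongside the API server; running
# it in a daemon thread and the 60-second interval are assumptions, not part
# of the original code.
import threading

worker = threading.Thread(target=daemon_job, args=(60,), daemon=True)
worker.start()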
def main(argv=None):
    try:
        db_cnx = db_conn.db_conn()
        url = sys.argv[1]
        crawl.crawl(url, db_cnx)
        db_cnx.close()
    except IndexError:
        print "Please run with a url (ex. \"python main.py http://www.example.com\")"
    except KeyboardInterrupt:
        print "Stopped crawling on interrupt."
        db_cnx.close()
def main():
    db_dirname = './data/'
    try:
        os.mkdir(db_dirname)
    except FileExistsError:
        pass
    # Optional CLI argument: an expression evaluated as the lambda body that
    # yields the delay (in seconds) between web requests.
    get_web_req_gap = (eval('lambda : ' + sys.argv[1])
                       if len(sys.argv) > 1
                       else lambda: random.uniform(1, 3))
    try:
        crawl.init(db_dirname + 'db.tinydb', get_web_req_gap)
        crawl.authorize()
        crawl.crawl(seeds=crawl.url2json('https://api.github.com/users'))
    except Exception:
        crawl.cleanup()
        raise  # re-raise with the original traceback
def main():
    url = input('URL: ')
    if not url:
        print('No URL was entered.')
        return
    input_type = input('1-TXT, 2-HTML: ')
    if input_type not in ['1', '2']:
        print('Invalid input type')
        return
    output_unit = input('Output chunk size (natural number): ')
    if r_non_digit.findall(output_unit):
        print('Input is not a natural number')
        return
    output_unit = int(output_unit)
    text = crawl(url)
    if input_type == '2':
        text = remove_html_tag(text)
    text = remove_non_alnum(text)
    text = sort_ascending(text)
    shares, rest = divide_text(text, output_unit)
    print('Quotient: ', ', '.join(shares))
    print('Remainder: ', rest)
def spider_scheduling(SpiderGlobalVariable, UrlRule):
    '''Scheduling loop: seed the URL-node queue, then pull crawled pages off
    the HTML queue and enqueue newly discovered URL nodes, giving same-netloc
    links the highest priority.'''
    for i in init_urlnode(SpiderGlobalVariable.start_url, UrlRule):
        SpiderGlobalVariable.global_urlnode_queue.put((0, i))
    while exit_condition(SpiderGlobalVariable):
        if SpiderGlobalVariable.htmlnode_queue.qsize() > 0:
            html_node = SpiderGlobalVariable.htmlnode_queue.get()
            linklist = crawl(html_node.url, html_node.html)
            for link in linklist:
                method, url, data = link[0], link[1], link[2]
                depth = html_node.depth
                referer = html_node.url
                node = UrlNode(url, referer, depth, method, data)
                if node.depth <= SpiderGlobalVariable.depth and UrlRule.check_url(node.check_url):
                    if is_netloc(node.url):
                        SpiderGlobalVariable.global_urlnode_queue.put((0, node))
                    else:
                        SpiderGlobalVariable.global_urlnode_queue.put(
                            (random.randint(1, 5), node))
                else:
                    SpiderGlobalVariable.refuse_count += 1
def search(request):
    if request.method == "POST":
        keyword = request.POST.get("keyword")
        print(keyword)
        url = ("http://www.enuri.com/search.jsp?nosearchkeyword=&issearchpage="
               "&searchkind=&es=&c=&ismodelno=false&hyphen_2=false&from=list"
               "&owd=&keyword={0}").format(keyword)
        makewordcloud.cloud(sementic_classifier.test(crawl.crawl(url)))
    return render(request, 'search.html')
def run(self):
    # time.sleep(0.3)
    # For testing with fixed data:
    # is_ok, content, satellite_data = crawl_test(self.satellite_data, self.model, "", self.create_time, self.end_time)
    is_ok, content, satellite_data = crawl(self.satellite_data, self.username,
                                           self.password, self.model,
                                           self.telemetry_num,
                                           self.create_time, self.end_time)
    self._signal.emit((is_ok, content, satellite_data))
def main():
    elastic = Elastic()
    config = get_params()
    path = config['crawler']['path']
    # download the IMDb data files
    wget.download(BASICS, out=path)
    wget.download(RATINGS, out=path)
    wget.download(EPISODES, out=path)
    # crawl movies and add them to Elasticsearch
    crawl()
    elastic.insert_elastic()
    # remove the downloaded files
    os.remove(f"{path}/title.basics.tsv.gz")
    os.remove(f"{path}/title.ratings.tsv.gz")
    os.remove(f"{path}/title.episode.tsv.gz")
def proc(u, epoch):
    ip = u["ip"]
    user = u["id"]
    service = SERVICE_ID
    flag = generateFlag(user, service, epoch)
    point = crawl(ip, flag)
    print("epoch:{} user:{} service:{} ip:{} is {}pt".format(
        epoch, user, service, ip, point))
    sendStatus(user, service, epoch, point)
def MainXSS():
    print PrintXssMask()
    output = MakeSelection()
    if output is None:
        return
    for rhost in crawl():
        StartAttack(output, rhost)
def crawl(self):
    print("Crawling")
    # the crawl function performs a hash index of all files in the target directories
    parser = argparse.ArgumentParser(
        description='Crawl the dirs and create a hash index')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='forces indexing of all directories')
    parser.add_argument('-p', '--path', help='pass a directory to crawl')
    args = parser.parse_args(sys.argv[2:])
    # crawl a provided directory, or all configured directories by default
    if args.path:
        crawl.crawl(args.force, args.path, jsondatadir)
    else:
        ingestsum = crawl.crawl(args.force, ingestdir, jsondatadir)
        workingdirsum = crawl.crawl(args.force, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(args.force, archivedir, jsondatadir)
def do():
    target = int(request.form.get("twa"))
    result = dict()
    twa, title, url = crawl(target)
    result["twa"] = twa
    result["title"] = title
    result["url"] = url
    print(result)
    return jsonify(result)
def getData():
    words = crawl(request.form["url"])
    documents.append(words)
    global data
    data = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
    dataResult = ""
    for d in data:
        dataResult += getDataTag(d)
    return render_template("result.html",
                           res=f"<Word : Word Num>\n\n{dataResult}")
def pbsc(username):
    ress = []
    ressExtro = 'Normal'  # result of prediction
    ressAgree = 'Normal'
    ressCons = 'Normal'
    crawl.crawl(username)
    clr1.clean1(username)
    clr2.clean2(username)
    # Extroversion logic
    lenTW = extro.lenTW(username)
    if lenTW != 0:
        extroValue = extro.driver(username)
        if extroValue[1] >= pExto and extroValue[3] <= qExtro:
            ressExtro = 'High'
        elif extroValue[3] >= qExtro and extroValue[1] <= pExto:
            ressExtro = 'Low'
        # Agreeableness logic
        agreValue = agree.driver(username)
        if agreValue[1] >= pAgre and agreValue[3] <= qAgre:
            ressAgree = 'High'
        elif agreValue[3] >= qAgre and agreValue[1] <= pAgre:
            ressAgree = 'Low'
        # Conscientiousness logic
        conValue = cons.driver(username)
        # print(conValue)
        if conValue[1] >= pCons and conValue[3] <= qCons:
            ressCons = 'High'
        elif conValue[3] >= qCons and conValue[1] <= pCons:
            ressCons = 'Low'
    else:
        ressAgree = 'Normal'
        ressCons = 'Normal'
        ressExtro = 'Normal'
    ress.append(username)
    ress.append(ressExtro)
    ress.append(ressAgree)
    ress.append(ressCons)
    return ress
def Test():
    TITLE, URL = crawl.crawl(inUrl, depth)
    for i in range(0, len(URL)):
        print "Title: ", TITLE[i], "\t", "URL: ", URL[i]
        print ' Check the correctness of this Title:'
        print ' is_alpha(TITLE[i]) and TITLE[i].isalpha(): '
        print ' ', is_alpha(TITLE[i]), TITLE[i].isalpha()
        print ' is_alnum(TITLE[i]) and TITLE[i].isalnum(): '
        print ' ', is_alnum(TITLE[i]), TITLE[i].isalnum()
        print ' startswith(TITLE[i],TITLE[i]) and TITLE[i].startswith(TITLE[i]):'
        print ' ', startswith(TITLE[i], TITLE[i]), TITLE[i].startswith(TITLE[i])
        print ' is_in(TITLE[i],TITLE[i]) and TITLE[i] in TITLE[i]'
        print ' ', is_in(TITLE[i], TITLE[i]), TITLE[i] in TITLE[i]
        print '\n'
def test(self):
    print("Running Full Test Sequence")
    # the ingest function sorts and moves files by date into the working/media directory
    ingest.ingest(ingestdir, workingdir)
    # the crawl function performs a hash index of all files in the target directories
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    archivedirsum = crawl.crawl(False, archivedir, jsondatadir)
    # the dedupe function combines all hash indexes and analyzes the dataset for duplicates
    data_files = glob.glob(jsondatadir + '/*.json')
    # run the dedupe function
    dedupe.dedupe(data_files, duplicatedir)
    # after the dedupe function has moved duplicates out, reindex
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    # the archive function pulls from the working/media directory and pools into sized volumes
    archive.archive(archivedir, jsondatadir, workingdir, mediasize)
    # validate that all files in duplicates exist elsewhere before moving to validated
    validate.validate(duplicatedir, workingdir, archivedir, validateddir)
    print("Full Test Sequence Completed Successfully")
def print_result():
    if request.method == 'POST':
        url = request.form['url']
        input_type = request.form['type']
        output_unit = int(request.form['unit'])
        text = crawl(url)
        if input_type.upper() == 'HTML':
            text = remove_html_tag(text)
        text = remove_non_alnum(text)
        text = sort_ascending(text)
        shares, rest = divide_text(text, output_unit)
        return render_template('main.html', shares=shares, rest=rest)
    else:
        return redirect(url_for('main_page'))
def run(self):
    """
    Deploys the bundled basicnode module to plab nodes using plab_assistant,
    crawls the ring continuously for 48 hours, and then gathers logs.
    """
    plab = plab_assistant.plab_assistant("install", nodes=None,
                                         username=self.slice_name,
                                         path_to_files=self.path_to_files,
                                         ssh_key=self.ssh_key)
    plab.run()
    os.chdir(self.base_path + "/node")
    os.system("sed 's/<Enabled>false/<Enabled>true/' -i node.config." + self.slice_name)
    # start the local basicnode without cronolog...
    start_basicnode = "./basicnode node.config." + self.slice_name + " &> log &"
    p = subprocess.Popen(start_basicnode.split(' '))
    self.local_basicnode_pid = p.pid
    time.sleep(60)
    os.chdir(self.base_path)
    start_utc = datetime.datetime.utcnow()
    test_length = datetime.timedelta(hours=48)
    content = open("node/node.config." + self.slice_name).read()
    port_line = re.search("<XmlRpcManager>.*</XmlRpcManager>", content, re.S).group()
    port = int(re.search(r"\d+", port_line).group())
    while datetime.datetime.utcnow() - start_utc < test_length:
        nodes = crawl.crawl(port)
        consistency, count = crawl.check_results(nodes)
        os.chdir(self.base_path)
        f = open("crawl.csv", "a")
        f.write(str(time.asctime()) + ", " + str(consistency) + ", " + str(count) + "\n")
        f.close()
        time.sleep(60 * 15)
    # done with the test; start getting logs and cleaning up
    plab = plab_assistant.plab_assistant("get_logs", nodes=None,
                                         username=self.slice_name,
                                         path_to_files=self.path_to_files,
                                         ssh_key=self.ssh_key)
    plab.run()
    os.system("zip -r9 results.zip logs output.log crawl.csv")
    # not strictly necessary, because installation cleans nodes first
    plab = plab_assistant.plab_assistant("uninstall", nodes=None,
                                         username=self.slice_name,
                                         path_to_files=self.path_to_files,
                                         ssh_key=self.ssh_key)
    plab.run()
    try:
        os.kill(self.local_basicnode_pid, signal.SIGKILL)
    except OSError:
        pass  # local basicnode already exited
def crawler(self):
    web_tb = self.webvar1.get()
    web_jd = self.webvar2.get()
    web_am = self.webvar3.get()
    web_all = self.webvar4.get()
    depth = self.dp_var.get()
    goods = self.goods_var.get()
    cr = crawl.crawl()
    if web_all == 1:
        cr.crawler_all(depth, goods)
        messagebox.showinfo(title='result', message='All sites crawled')
    else:
        if web_jd == 1:
            cr.crawler_jd(depth, goods)
            messagebox.showinfo(title='result', message='JD crawl finished')
        if web_tb == 1:
            cr.crawler_tb(depth, goods)
            messagebox.showinfo(title='result', message='Taobao crawl finished')
        if web_am == 1:
            cr.crawler_am(depth, goods)
            messagebox.showinfo(title='result', message='TMALL crawl finished')
def main():
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "", ["debug", "port="])
        port = 10000
        debug = False
        for k, v in optlist:
            if k == "--port":
                port = int(v)
            elif k == "--debug":
                debug = True
    except (getopt.GetoptError, ValueError):
        # invalid options or a non-integer port
        print usage
        return
    nodes = crawl.crawl(port, debug=debug)
    print crawl.check_results(nodes)
    for task in tasks:
        print task.run(port, nodes, debug)
def crawling():
    while True:
        print(time.ctime())
        crawl()
        time.sleep(10)
from apscheduler.schedulers.background import BackgroundScheduler
import pandas as pd
import time
import crawl
import datetime
import mail
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read('config.ini')

fmk_news = crawl.crawl(url=cfg["fmk_news"]["url"],
                       title=cfg["fmk_news"]["title"],
                       time_=cfg["fmk_news"]["time_"],
                       site=cfg["fmk_news"]["site"],
                       category="News",
                       sub=r"\[[0-9]{1,4}\]")
fmk_fashion = crawl.crawl(url=cfg["fmk_fashion"]["url"],
                          title=cfg["fmk_fashion"]["title"],
                          time_=cfg["fmk_fashion"]["time_"],
                          site=cfg["fmk_fashion"]["site"],
                          category="Fashion",
                          sub=r"\[[0-9]{1,4}\]")
naver_finance_news = crawl.crawl(url=cfg["naver_finance_news"]["url"],
                                 title=cfg["naver_finance_news"]["title"],
                                 time_=cfg["naver_finance_news"]["time_"],
                                 site=cfg["naver_finance_news"]["site"],
                                 address=cfg["naver_finance_news"]["address"],
                                 category="Finance",
                                 sub=r"\n[\s\S]*")
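# A hedged sketch of how the BackgroundScheduler imported above might drive
# these crawlers; the job body, the 6-hour interval, and the keep-alive loop
# are assumptions, since the original fragment stops before the scheduling code.
def send_digest():
    # hypothetical job body; the real module presumably formats the crawled
    # items and sends them via the imported mail module
    print(datetime.datetime.now(), "digest tick")

scheduler = BackgroundScheduler()
scheduler.add_job(send_digest, 'interval', hours=6)
scheduler.start()

# keep the main thread alive so the background scheduler can run
while True:
    time.sleep(60)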
def scheduled_job():
    print('Crawl script runs every day at 6 am [timezone = +7.00]')
    crawl()
    print("Crawl: Done")
import os
import random
import time

import tweepy

from crawl import crawl  # assumed local module providing crawl()


class TwitterAPI:
    """
    Class for accessing the Twitter API.

    Requires API credentials to be available in environment variables.
    These will be set appropriately if the bot was created with init.sh
    included with the heroku-twitterbot-starter.
    """

    def __init__(self):
        consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
        consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
        access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
        auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(auth)

    def tweet(self, message):
        """Send a tweet."""
        self.api.update_status(message)


if __name__ == "__main__":
    twitter = TwitterAPI()
    while True:
        tweet = crawl()
        if tweet:
            twitter.tweet(tweet)
        time.sleep(21600 + random.randint(0, 21600))  # wait 6-12 hours
# -*- encoding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import Queue
import thread
import time
from bs4 import BeautifulSoup
from crawl import crawl

urllis_q = Queue.Queue()

crawler_task1 = crawl()
crawler_task2 = crawl()


def list_zhidao_url():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C0%B1%BD%B7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i * 10)


def list_zhidao_url_xiaomi():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C3%D7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i * 10)


def list_zhidao_url_coolpad():
    url = '''http://zhidao.baidu.com/search?word=%BF%E1%C5%C9%B4%F3%C9%F1%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(73):
        yield url + str(i * 10)
import re

from crawl import crawl

c = crawl()
c.set_root_link('https://www.gog.com/')
c.set_filename('gogData.db')
c.set_link_checker(lambda s: '/game/' in s)
c.set_link_rule(lambda link: re.search(r".*www\.gog\.com.*", link) is not None)
c.set_content_rule('name',
                   lambda s: s.find('h1', {'class': 'header__title'}).text)
c.set_content_rule(
    'price',
    lambda s: str(float(re.search(
        r'<span\ class="_price">(\ |\n)+([0-9]|\.)+(\ |\n)+</span>',
        s.prettify()).group()[21:-7].strip()) * 30))
c.set_content_rule(
    'discount',
    lambda s: str(float(s.find('span', {'class': 'buy-price__new'}).text) * 30))
c.set_content_rule(
    'feature',
    lambda s: ", ".join(i.text.strip() for i in
                        s.findAll('span', {'class': 'game-features__title'})))
c.run(1)
import sys
import webbrowser
import re
import time

import crawl

SHOP_URL = 'http://www.ecocion-shop.de/web/main.php/shop/addCart?live_pid='
DELAY = 3

if __name__ == '__main__':
    with open(sys.argv[1], 'r') as input_file:
        fresh_item_list = ['empty row']
        crawl.crawl(fresh_item_list, fresh_item_list, crawl.FRESH_CATEGORIES_LIST)
        if 'frisch' not in input_file.name:
            dry_item_list = ['empty row']
            crawl.crawl(fresh_item_list, dry_item_list, crawl.ALL_ITEMS)
            with open(crawl.DRY_ARTICLES_CSV, 'r') as dry_file:
                # for comparison to detect price changes
                old_dry_item_list = dry_file.readlines()
        # find the first occurrence of 'Nummer Menge Name'
        order = input_file.read().split('Nummer Menge Name', 1)[1]
        # split into lines
        lines = order.split('\n')
        ask_to_proceed = False
        id_count_name_list = []
        for line in lines:
            splitted_line = re.split(r' +', line)
            # e.g. ['', '392459', '2', 'Bioland', 'Joghurt', 'Natur', '1L']
            if len(splitted_line) > 3:
create_dirs("temp/") #Logging sys.stdout = Logger() #Check Python version print("Using Python version " +\ str(sys.version_info.major) + "." +\ str(sys.version_info.minor) + "." +\ str(sys.version_info.micro) + " " +\ sys.version_info.releaselevel + " " +\ str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit") if sys.version_info.major != 2: print("Not supported; use Python 2") elif 0: print("") #start crawler = crawl() crawler.start() try: while(True): time.sleep(2) except KeyboardInterrupt: try: print(" -> Closing program") time.sleep(5) print(" -> Analyzing") except KeyboardInterrupt: print(" -> Skipping analysis") crawler.quit_analyze = False crawler.quit = True crawler.join() print("I don't think there is a need to run another Steam crawler") raw_input("Finished")
url = urllib.parse.urlparse(line.strip().split(',', 1)[0])
assert url.scheme.startswith('http')
str_url = '{}://{}/'.format(url.scheme, url.netloc)
if str_url not in unique_urls:
    unique_urls.add(str_url)
    queue.put(str_url)

os.makedirs('data/output', exist_ok=True)
links_queue = queue.Queue()

NUM_THREADS = 1
threads = []
for i in range(NUM_THREADS):
    threads.append(threading.Thread(target=run, args=(i, links_queue)))
    threads[-1].start()

if len(sys.argv) < 2:
    crawl.crawl(links_queue)
else:
    read_links_from_file(sys.argv[1], links_queue)

for i in range(NUM_THREADS):
    links_queue.put(None)
for thread in threads:
    thread.join()
import crawl

soup = crawl.crawl("https://www.ii.uni.wroc.pl/~marcinm/", 13, crawl.action)
soup2 = crawl.crawl("https://pl.wikipedia.org/wiki/Python", 3, crawl.action)
soup3 = crawl.crawl('https://www.ii.uni.wroc.pl/~marcinm/dyd/python/', 3, crawl.action)

print("\n\nII.UNI.WROC.PL/~MARCINM\n\n")
for i in soup:
    print(i)
print("\n\nWIKIPEDIA\n\n")
for i in soup2:
    print(i)
print("\n\nDYD/PYTHON/\n\n")
for i in soup3:
    print(i)

"""
g1, g2 - generators

def sum(g1, g2):
    for i in g1:
        yield i
    for i in g2:
        yield i

def sum(g1, g2):
    yield from g1
    yield from g2

def f():
    yield <...>
def setUp(self):
    self.crawl = crawl(path)
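# A minimal sketch of the unittest.TestCase context this setUp fragment
# implies; the class name, the crawl import, the `path` value, and the test
# method are assumptions for illustration.
import unittest

from crawl import crawl

path = './data'  # hypothetical fixture path


class CrawlTestCase(unittest.TestCase):
    def setUp(self):
        self.crawl = crawl(path)

    def test_created(self):
        self.assertIsNotNone(self.crawl)


if __name__ == '__main__':
    unittest.main()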
# -*- coding: utf-8 -*-
__author__ = 'Taikor'

from crawl import crawl
from output_excel import output_excel
from copyfile import copyfile

if __name__ == "__main__":
    crawl()
    output_excel()
    copyfile(r'C:\xampp\htdocs\cosmetic_price_check\data\jd\MianBuDiShuang')
    print("Crawler Task Completed !")
def crawl_train_schedules():
    debug("Crawling train schedules")
    count = db.query(func.count(TempTrain.id)).scalar()
    for i, row in enumerate(db.query(TempTrain.url)):
        debug("Crawling %s of %s. Remaining %s" % (i, count, count - i))
        crawl(row.url)
def route_generate():
    crawl.crawl()
    return redirect(url_for("route_download"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 10:17:10 2018

@author: redman
"""
import sys

import crawl
import push
import popular
import findword

if __name__ == '__main__':
    if sys.argv[1] == 'crawl':
        crawl.crawl()
    elif sys.argv[1] == 'push':
        push.countpush(sys.argv[2], sys.argv[3])
    elif sys.argv[1] == 'popular':
        popular.countpopular(sys.argv[2], sys.argv[3])
    elif sys.argv[1] == 'findword':
        findword.word(sys.argv[3], sys.argv[4], sys.argv[2])
def server():
    condition = request.args.get('keyword')
    return crawl(condition)
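# A minimal sketch wiring the route above into a Flask app; the route path
# and app layout are assumptions, and crawl() is assumed importable (e.g.
# `from crawl import crawl`) and to return a string or Response.
from flask import Flask, request

app = Flask(__name__)
app.add_url_rule('/crawl', 'server', server)

if __name__ == '__main__':
    app.run()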