Example No. 1
def getSimilarity():
    words1 = crawl(request.form["url1"])
    documents.append(words1)
    data1 = sorted(words1.items(), key=operator.itemgetter(1), reverse=True)

    words2 = crawl(request.form["url2"])
    documents.append(words2)
    data2 = sorted(words2.items(), key=operator.itemgetter(1), reverse=True)

    similarity = getCosineSimilarity(documents, words1, words2)

    commonData = []
    for d in data1:
        try:
            commonData.append((d[0], d[1] + data2[findIndex(d[0], data2)][1]))
        except ValueError:
            # Word not present in data2; skip it.
            pass

    commonData.sort(key=lambda word: word[1], reverse=True)

    dataResult = ""
    for d in commonData:
        dataResult += getDataTag(d)

    return render_template("result.html", res = f"<Similarity> : {similarity}\n\n<Common Word : Word Num>\n\n{dataResult}")
Example No. 2
def runCrawl(url_list, db_name='crawl-data', manual=True, append=False):
    '''
    Basic wrapper around the OpenWPM crawl script.
        url_list: list of URLs to crawl, e.g. ['https://abc.com', 'https://xyz.net']
        db_name: name of the database to write crawl output to
        manual: set to False to skip the crawl-data deletion warning
        append: set to True to append to existing crawl data instead of deleting it
    '''
    if manual and not append:
        go = input(
            "Proceeding will delete the old crawl data. Enter Y to continue.\n"
        )
    else:
        go = 'y'

    if go.lower() == 'y':
        if not append:
            try:
                os.remove('datadir/{}.sqlite'.format(db_name))
            except FileNotFoundError:
                print("Old crawl data not found. Continuing.")

        crawl.crawl(url_list, db_name)
    else:
        print("Aborting.")
Example No. 3
    def daily(self):
        print("Running Daily Job")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)
Example No. 4
def daemon_job(interval):
    """
        main function of the crawler daemon.
        interval: time in seconds between each crawl
    """
    time.sleep(3)  # Wait for api server to start first
    while True:
        try:
            crawl()
            process_notification()
        except Exception:
            traceback.print_exc()
        time.sleep(interval)
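One common way to start such a daemon loop alongside the API server is in a background thread; a sketch under that assumption (the 3600-second interval is an arbitrary placeholder).

import threading

# Run the crawler loop defined above in a daemon thread so it does not
# block the API server; the interval value here is an assumption.
worker = threading.Thread(target=daemon_job, args=(3600,), daemon=True)
worker.start()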
Example No. 5
def main(argv=None):
    try:
        db_cnx = db_conn.db_conn()
        url = sys.argv[1]

        crawl.crawl(url, db_cnx)

        db_cnx.close()
    except IndexError:
        print "Please run with a url (ex. \"python main.py http://www.example.com\")"
    except KeyboardInterrupt:
        print "Stopped crawling on interrupt."
        db_cnx.close()
Example No. 6
def main():
    db_dirname = './data/'
    try:
        os.mkdir(db_dirname)
    except FileExistsError:
        pass
    get_web_req_gap = (eval('lambda : ' + sys.argv[1])
                       if len(sys.argv) > 1 else lambda: random.uniform(1, 3))

    try:
        crawl.init(db_dirname + 'db.tinydb', get_web_req_gap)
        crawl.authorize()
        crawl.crawl(seeds=crawl.url2json('https://api.github.com/users'))
    except Exception as e:
        crawl.cleanup()
        raise e
Example No. 8
def main():
    url = input('URL: ')
    if not url:
        print('No URL was entered.')
        return

    input_type = input('1-TXT, 2-HTML: ')
    if input_type not in ['1', '2']:
        print('Invalid input type.')
        return

    output_unit = input('Output group size (natural number): ')
    if r_non_digit.findall(output_unit):
        print('Input is not a natural number.')
        return

    output_unit = int(output_unit)

    text = crawl(url)
    if input_type == '2':
        text = remove_html_tag(text)

    text = remove_non_alnum(text)
    text = sort_ascending(text)
    shares, rest = divide_text(text, output_unit)

    print('Shares: ', ', '.join(shares))
    print('Remainder: ', rest)
Example No. 9
def spider_scheduling(SpiderGlobalVariable, UrlRule):
    '''
    Scheduler loop: seed the queue from SpiderGlobalVariable.start_url, then keep
    pulling parsed pages, extracting their links, filtering them through UrlRule,
    and pushing accepted UrlNodes back onto the priority queue.
    '''
    for i in init_urlnode(SpiderGlobalVariable.start_url, UrlRule):
        SpiderGlobalVariable.global_urlnode_queue.put((0, i))

    while exit_condition(SpiderGlobalVariable):
        if SpiderGlobalVariable.htmlnode_queue.qsize() > 0:
            html_node = SpiderGlobalVariable.htmlnode_queue.get()
            linklist = crawl(html_node.url, html_node.html)
            for i in linklist:
                url = i[1]
                method = i[0]
                data = i[2]
                depth = html_node.depth
                referer = html_node.url
                i = UrlNode(url, referer, depth, method, data)

                if i.depth <= SpiderGlobalVariable.depth and UrlRule.check_url(
                        i.check_url):
                    if is_netloc(i.url):
                        SpiderGlobalVariable.global_urlnode_queue.put((0, i))
                    else:
                        SpiderGlobalVariable.global_urlnode_queue.put(
                            (random.randint(1, 5), i))

                else:
                    SpiderGlobalVariable.refuse_count += 1
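The tuple priorities above mean same-netloc links (priority 0) are fetched before external links (random priority 1-5). A small stand-alone illustration of that ordering, using plain strings instead of UrlNode objects:

from queue import PriorityQueue

q = PriorityQueue()
q.put((3, 'https://external.example/page'))   # external link, lower priority
q.put((0, 'https://target.example/page'))     # same-netloc link, fetched first
print(q.get())  # -> (0, 'https://target.example/page')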
Example No. 11
def search(request):
    if request.method == "POST":
        keyword = request.POST.get("keyword")
        print(keyword)
        url = "http://www.enuri.com/search.jsp?nosearchkeyword=&issearchpage=&searchkind=&es=&c=&ismodelno=false&hyphen_2=false&from=list&owd=&keyword={0}".format(
            keyword)
        makewordcloud.cloud(sementic_classifier.test(crawl.crawl(url)))
        return render(request, 'search.html')
Example No. 12
 def run(self):
     # time.sleep(0.3)
     # For testing, reuse the same data:
     # is_ok, content, satellite_data = crawl_test(self.satellite_data, self.model, "", self.create_time, self.end_time)
     is_ok, content, satellite_data = crawl(self.satellite_data,
                                            self.username, self.password,
                                            self.model, self.telemetry_num,
                                            self.create_time, self.end_time)
     self._signal.emit((is_ok, content, satellite_data))
Example No. 13
def main():
    elastic = Elastic()
    config = get_params()
    path = config['crawler']['path']

    # download imdb files
    wget.download(BASICS, out=path)
    wget.download(RATINGS, out=path)
    wget.download(EPISODES, out=path)

    # crawl movies and add to elastic
    crawl()
    elastic.insert_elastic()

    # remove files
    os.remove(f"{config['crawler']['path']}/title.basics.tsv.gz")
    os.remove(f"{config['crawler']['path']}/title.ratings.tsv.gz")
    os.remove(f"{config['crawler']['path']}/title.episode.tsv.gz")
Example No. 14
def proc(u, epoch):
    ip = u["ip"]
    user = u["id"]
    service = SERVICE_ID
    flag = generateFlag(user, service, epoch)
    point = crawl(ip, flag)
    print("epoch:{} user:{} service:{} ip:{} is {}pt".format(
        epoch, user, service, ip, point))
    sendStatus(user, service, epoch, point)
Example No. 15
def MainXSS():
    print PrintXssMask()

    output = MakeSelection()

    if output is None:
        return

    for rhost in crawl():
        StartAttack(output, rhost)
Example No. 16
 def crawl(self):
     print("Crawling")
     #the crawl function performs a hash index of all files in the target directories
     parser = argparse.ArgumentParser(
         description='Crawl the dirs and create a hash index')
     parser.add_argument('-f',
                         '--force',
                         action='store_true',
                         default=False,
                         help='forces indexing of all directories')
     parser.add_argument('-p', '--path', help='pass a directory to crawl')
     args = parser.parse_args(sys.argv[2:])
     #Crawl a provided directory
     if args.path:
         crawl.crawl(args.force, args.path, jsondatadir)
     else:
         ingestsum = crawl.crawl(args.force, ingestdir, jsondatadir)
         workingdirsum = crawl.crawl(args.force, workingdir, jsondatadir)
         archivedirsum = crawl.crawl(args.force, archivedir, jsondatadir)
Example No. 17
def do():
    target = int(request.form.get("twa"))
    result = dict()
    twa = []
    title = []
    url = []
    twa, title, url = crawl(target)
    result["twa"] = twa
    result["title"] = title
    result["url"] = url
    print(result)
    return jsonify(result)
Example No. 18
def getData():
    words = crawl(request.form["url"])
    documents.append(words)

    global data
    data = sorted(words.items(), key=operator.itemgetter(1), reverse=True)

    dataResult = ""
    for d in data:
        dataResult += getDataTag(d)

    return render_template("result.html", res = f"<Word : Word Num>\n\n{dataResult}")
Example No. 19
def pbsc(username):
    ress = []
    ressExtro = 'Normal'  # result of prediction
    ressAgree = 'Normal'
    ressCons = 'Normal'
    crawl.crawl(username)
    clr1.clean1(username)
    clr2.clean2(username)
    # logic Extroversion
    lenTW = extro.lenTW(username)
    if lenTW != 0:
        extroValue = extro.driver(username)
        if extroValue[1] >= pExto and extroValue[3] <= qExtro:
            ressExtro = 'High'
        elif extroValue[3] >= qExtro and extroValue[1] <= pExto:
            ressExtro = 'Low'

        # logic Agreeableness
        agreValue = agree.driver(username)
        if agreValue[1] >= pAgre and agreValue[3] <= qAgre:
            ressAgree = 'High'
        elif agreValue[3] >= qAgre and agreValue[1] <= pAgre:
            ressAgree = 'Low'

        # logic Conscientiousness
        conValue = cons.driver(username)
        # print(conValue)
        if conValue[1] >= pCons and conValue[3] <= qCons:
            ressCons = 'High'
        elif conValue[3] >= qCons and conValue[1] <= pCons:
            ressCons = 'Low'
    else:
        ressAgree = 'Normal'
        ressCons = 'Normal'
        ressExtro = 'Normal'
    ress.append(username)
    ress.append(ressExtro)
    ress.append(ressAgree)
    ress.append(ressCons)
    return ress
Example No. 20
def Test():
    TITLE, URL = crawl.crawl(inUrl,depth)
    for i in range(0,len(URL)):
        print "Title: ", TITLE[i], "\t", "URL: ", URL[i]
        print ' Check the correctness of this Title:'
        print '  is_alpha(TITLE[i]) and TITLE[i].isalpha(): '
        print '  ',is_alpha(TITLE[i]),TITLE[i].isalpha()
        print '  is_alnum(TITLE[i]) and TITLE[i].isalnum(): '
        print '  ',is_alnum(TITLE[i]), TITLE[i].isalnum()
        print '  startswith(TITLE[i],TITLE[i]) and TITLE[i].startswith(TITLE[i]):'
        print '  ',startswith(TITLE[i],TITLE[i]), TITLE[i].startswith(TITLE[i])
        print '  is_in(TITLE[i],TITLE[i]) and TITLE[i] in TITLE[i]'
        print '  ',is_in(TITLE[i],TITLE[i]), TITLE[i] in TITLE[i]
        print '\n'
Example No. 21
    def test(self):
        print("Running Full Test Sequence")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)

        #after the dedupe function has moved duplicates out, reindex
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)

        #the archive function pulls from the working/media directory and pools into sized volumes
        archive.archive(archivedir, jsondatadir, workingdir, mediasize)

        #validate that all files in duplicates exist elsewhere before moving to validated
        validate.validate(duplicatedir, workingdir, archivedir, validateddir)

        print("Daily Job Completed Successfully")
Example No. 22
def print_result():
    if request.method == 'POST':
        url = request.form['url']
        input_type = request.form['type']
        output_unit = int(request.form['unit'])

        text = crawl(url)
        if input_type.upper() == 'HTML':
            text = remove_html_tag(text)

        text = remove_non_alnum(text)
        text = sort_ascending(text)
        shares, rest = divide_text(text, output_unit)

        return render_template('main.html', shares=shares, rest=rest)
    else:
        return redirect(url_for('main_page'))
Example No. 23
    def run(self):
        """  deploys bundled basicnode module to plab nodes using plab_assistant.
         crawls the ring continuously for 48 hours and then gathers logs. """
        plab = plab_assistant.plab_assistant("install", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        os.chdir(self.base_path + "/node")
        os.system("sed 's/<Enabled>false/<Enabled>true/' -i node.config." +
                  self.slice_name)
        # start local basicnode without cronolog...
        start_basicnode = "./basicnode node.config." + self.slice_name + " &> log &"
        p = subprocess.Popen(start_basicnode.split(' '))
        self.local_basicnode_pid = p.pid
        time.sleep(60)

        os.chdir(self.base_path)
        start_utc = datetime.datetime.utcnow()
        test_length = datetime.timedelta(hours=48)
        content = open("node/node.config." + self.slice_name).read()
        port_line = re.search("<XmlRpcManager>.*</XmlRpcManager>", content,
                              re.S).group()
        port = int(re.search("\d+", port_line).group())
        while datetime.datetime.utcnow() - start_utc < test_length:
            nodes = crawl.crawl(port)
            consistency, count = crawl.check_results(nodes)
            os.chdir(self.base_path)
            f = open("crawl.csv", "a")
            f.write(
                str(time.asctime()) + ", " + str(consistency) + ", " +
                str(count) + "\n")
            f.close()
            time.sleep(60 * 15)
        # done with the test, start getting logs and cleaning up.
        plab = plab_assistant.plab_assistant("get_logs", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        os.system("zip -r9 results.zip logs output.log crawl.csv")
        # Actually not necessary because installation cleans nodes first.
        plab = plab_assistant.plab_assistant("uninstall", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        try:
            os.kill(self.local_basicnode_pid, signal.SIGKILL)
        except:
            pass
Example No. 25
 def crawler(self):
     web_tb = self.webvar1.get()
     web_jd = self.webvar2.get()
     web_am = self.webvar3.get()
     web_all = self.webvar4.get()
     depth = self.dp_var.get()
     goods = self.goods_var.get()
     cr = crawl.crawl()
     if web_all == 1:
         cr.crawler_all(depth, goods)
         messagebox.showinfo(title='result', message='All crawling finished')
     else:
         if web_jd == 1:
             cr.crawler_jd(depth, goods)
             messagebox.showinfo(title='result', message='jd crawl finished')
         if web_tb == 1:
             cr.crawler_tb(depth, goods)
             messagebox.showinfo(title='result', message='tb crawl finished')
         if web_am == 1:
             cr.crawler_am(depth, goods)
             messagebox.showinfo(title='result', message='TMALL crawl finished')
Example No. 26
def main():
  try:
    optlist, args = getopt.getopt(sys.argv[1:], "", ["debug", "port="])

    port = 10000
    debug = False

    for k,v in optlist:
      if k == "--port":
        port = int(v)
      elif k == "--debug":
        debug = True
  except:
    print usage
    return

  nodes = crawl.crawl(port, debug = debug) 
  print crawl.check_results(nodes)

  for task in tasks:
    print task.run(port, nodes, debug)
Example No. 27
def crawling():
    while True:
        print(time.ctime())
        crawl()
        time.sleep(10)
Example No. 28
from apscheduler.schedulers.background import BackgroundScheduler
import pandas as pd
import time
import crawl
import datetime
import mail
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read('config.ini')

fmk_news = crawl.crawl(url=cfg["fmk_news"]["url"],
                       title=cfg["fmk_news"]["title"],
                       time_=cfg["fmk_news"]["time_"],
                       site=cfg["fmk_news"]["site"],
                       category="News",
                       sub="\[[0-9]{1,4}\]")
fmk_fashion = crawl.crawl(url=cfg["fmk_fashion"]["url"],
                          title=cfg["fmk_fashion"]["title"],
                          time_=cfg["fmk_fashion"]["time_"],
                          site=cfg["fmk_fashion"]["site"],
                          category="Fashion",
                          sub="\[[0-9]{1,4}\]")
naver_finance_news = crawl.crawl(url=cfg["naver_finance_news"]["url"],
                                 title=cfg["naver_finance_news"]["title"],
                                 time_=cfg["naver_finance_news"]["time_"],
                                 site=cfg["naver_finance_news"]["site"],
                                 address=cfg["naver_finance_news"]["address"],
                                 category="Finance",
                                 sub="\n[\s\S]*")
Example No. 29
def scheduled_job():
    print('Crawl script runs every day at 6am [timezone = +7.00]')
    crawl()
    print("Crawl: Done")
Example No. 30
import os
import random
import time

import tweepy

from crawl import crawl  # assumed source of the crawl() call below

class TwitterAPI:
    """
    Class for accessing the Twitter API.

    Requires API credentials to be available in environment
    variables. These will be set appropriately if the bot was created
    with init.sh included with the heroku-twitterbot-starter
    """
    def __init__(self):
        consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
        consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
        access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
        auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(auth)

    def tweet(self, message):
        """Send a tweet"""
        self.api.update_status(message)

if __name__ == "__main__":
    twitter = TwitterAPI()
    while True:
        tweet = crawl()
        if tweet:
            twitter.tweet(tweet)
        time.sleep(21600 + random.randint(0, 21600)) # wait 6-12 hours
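The class docstring above notes that credentials come from environment variables; a placeholder sketch of providing them for local testing (all values are dummies, not real keys):

import os

os.environ.setdefault('TWITTER_CONSUMER_KEY', 'dummy-key')
os.environ.setdefault('TWITTER_CONSUMER_SECRET', 'dummy-secret')
os.environ.setdefault('TWITTER_ACCESS_TOKEN', 'dummy-token')
os.environ.setdefault('TWITTER_ACCESS_TOKEN_SECRET', 'dummy-token-secret')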
Example No. 31
#-*- encoding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import  Queue
import thread
from bs4 import BeautifulSoup
urllis_q = Queue.Queue()
from crawl import crawl
import time 

crawler_task1 = crawl()
crawler_task2 = crawl()


def list_zhidao_url():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C0%B1%BD%B7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i*10)


def list_zhidao_url_xiaomi():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C3%D7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i*10)


def list_zhidao_url_coolpad():
    url = '''http://zhidao.baidu.com/search?word=%BF%E1%C5%C9%B4%F3%C9%F1%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(73):
        yield url + str(i*10)
Example No. 32
from crawl import crawl
import re

c = crawl()
c.set_root_link('https://www.gog.com/')
c.set_filename('gogData.db')
c.set_link_checker(lambda s: '/game/' in s)
c.set_link_rule(lambda link: re.search(r".*www\.gog\.com.*", link) is not None)
c.set_content_rule('name', lambda s: s.find('h1', {
    'class': 'header__title'
}).text)
c.set_content_rule(
    'price', lambda s: str(
        float(
            re.search(
                r'<span\ class="_price">(\ |\n)+([0-9]|\.)+(\ |\n)+</span>',
                s.prettify()).group()[21:-7].strip()) * 30))
c.set_content_rule(
    'discount', lambda s: str(
        float(s.find('span', {
            'class': 'buy-price__new'
        }).text) * 30))
c.set_content_rule(
    'feature', lambda s: ", ".join([
        i.text.strip()
        for i in s.findAll('span', {'class': 'game-features__title'})
    ]))

c.run(1)
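To see what a content rule like 'name' above extracts, here is a self-contained illustration using a hand-written HTML fragment; the markup is a placeholder mimicking the expected structure, not GOG's real page.

from bs4 import BeautifulSoup

html = '<html><body><h1 class="header__title">Example Game</h1></body></html>'
soup = BeautifulSoup(html, 'html.parser')

name_rule = lambda s: s.find('h1', {'class': 'header__title'}).text
print(name_rule(soup))  # -> Example Game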
Example No. 33
import sys
import webbrowser
import re
import time
import crawl

SHOP_URL = 'http://www.ecocion-shop.de/web/main.php/shop/addCart?live_pid='
DELAY = 3

if __name__ == '__main__':
    with open(sys.argv[1], 'r') as input_file:
        fresh_item_list = ['empty row']
        crawl.crawl(fresh_item_list, fresh_item_list, crawl.FRESH_CATEGORIES_LIST)
        if 'frisch' not in input_file.name:
            dry_item_list = ['empty row']
            crawl.crawl(fresh_item_list, dry_item_list, crawl.ALL_ITEMS)

        with open(crawl.DRY_ARTICLES_CSV, 'r') as dry_file:  # for comparison to detect price changes
            old_dry_item_list = dry_file.readlines()

        # find first occurrence of 'Nummer    Menge   Name'
        order = input_file.read().split('Nummer    Menge   Name', 1)[1]

        # split into lines
        lines = order.split('\n')

        ask_to_proceed = False
        id_count_name_list = []
        for line in lines:
            splitted_line = re.split(r' +', line)  # e.g. ['', '392459', '2', 'Bioland', 'Joghurt', 'Natur', '1L']
            if len(splitted_line) > 3:
Example No. 34
     create_dirs("temp/")
     #Logging
     sys.stdout = Logger()
     #Check Python version
     print("Using Python version " +\
           str(sys.version_info.major) + "." +\
           str(sys.version_info.minor) + "." +\
           str(sys.version_info.micro) + " " +\
           sys.version_info.releaselevel + " " +\
           str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit")
     if sys.version_info.major != 2:
         print("Not supported; use Python 2")
     elif 0:
         print("")
         #start
         crawler = crawl()
         crawler.start()
         try:
             while(True): time.sleep(2)
         except KeyboardInterrupt:
             try:
                 print("    -> Closing program")
                 time.sleep(5)
                 print("    -> Analyzing")
             except KeyboardInterrupt:
                 print("    -> Skipping analysis")
                 crawler.quit_analyze = False
             crawler.quit = True
             crawler.join()
     print("I don't think there is a need to run another Steam crawler")
 raw_input("Finished")
Example No. 35
            url = urllib.parse.urlparse(line.strip().split(',', 1)[0])
            assert url.scheme.startswith('http') 
            str_url = '{}://{}/'.format(url.scheme, url.netloc)
            if str_url not in unique_urls:
                unique_urls.add(str_url)
                queue.put(str_url)


os.makedirs('data/output', exist_ok=True)

links_queue = queue.Queue()
NUM_THREADS=1

threads = []
for i in range(NUM_THREADS):
    threads.append(threading.Thread(target=run, args=(i, links_queue)))
    threads[-1].start()


if len(sys.argv) < 2:
    crawl.crawl(links_queue)
else:
    read_links_from_file(sys.argv[1], links_queue)

for i in range(NUM_THREADS):
    links_queue.put(None)

for thread in threads:
    thread.join()

Example No. 36
import crawl

soup = crawl.crawl("https://www.ii.uni.wroc.pl/~marcinm/", 13, crawl.action)
soup2 = crawl.crawl("https://pl.wikipedia.org/wiki/Python", 3, crawl.action)
soup3 = crawl.crawl('https://www.ii.uni.wroc.pl/~marcinm/dyd/python/', 3,
                    crawl.action)

print("\n\nII.UNI.WROC.PL/~MARCINM\n\n")
for i in soup:
    print(i)
print("\n\nWIKIPEDIA\n\n")
for i in soup2:
    print(i)
print("\n\nDYD/PYTHON/\n\n")
for i in soup3:
    print(i)
"""
g1, g2- generator

def sum (g1, g2):
    for i in g1:
        yield i
    for i in g2:
        yield i

def sum (g1, g2):
    yield from g1
    yield from g2

def f():
    yield <...>
Example No. 37
	def setUp(self):
		self.crawl = crawl(path)
Example No. 38
# -*- coding: utf-8 -*-
__author__ = 'Taikor'


from crawl import crawl
from output_excel import output_excel
from copyfile import copyfile

if __name__ == "__main__":
    crawl()
    output_excel()
    copyfile(r'C:\xampp\htdocs\cosmetic_price_check\data\jd\MianBuDiShuang')
    print("Crawler Task Completed !")


Example No. 39
def crawl_train_schedules():
    debug("Crawling Train schedules")
    count = db.query(func.count(TempTrain.id)).scalar()
    for i, row in enumerate(db.query(TempTrain.url)):
        debug("Crawing %s of %s. Remaining %s" % (i, count, count-i))
        crawl(row.url)
Example No. 40
def route_generate():

	crawl.crawl()

	return redirect(url_for("route_download"))
Example No. 41
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 10:17:10 2018

@author: redman
"""
import requests
from bs4 import BeautifulSoup
import argparse
import crawl
import push
import popular
import findword
import sys

if __name__ == '__main__':

    if (sys.argv[1] == 'crawl'):
        #print('boo')
        crawl.crawl()
    if (sys.argv[1] == 'push'):
        #print('good')
        push.countpush(sys.argv[2], sys.argv[3])
    if (sys.argv[1] == 'popular'):
        popular.countpopular(sys.argv[2], sys.argv[3])
    if (sys.argv[1] == 'findword'):
        #print('hello')
        #print(sys.argv[2])
        findword.word(sys.argv[3], sys.argv[4], sys.argv[2])
Example No. 42
def server():
    condition = request.args.get('keyword')
    return crawl(condition)