Example No. 1
def getSimilarity():
    words1 = crawl(request.form["url1"])
    documents.append(words1)
    data1 = sorted(words1.items(), key=operator.itemgetter(1), reverse=True)

    words2 = crawl(request.form["url2"])
    documents.append(words2)
    data2 = sorted(words2.items(), key=operator.itemgetter(1), reverse=True)

    similarity = getCosineSimilarity(documents, words1, words2)

    commonData = []
    for d in data1:
        try:
            commonData.append((d[0], d[1] + data2[findIndex(d[0], data2)][1]))
        except ValueError:
            # Word not present in data2; skip it.
            pass

    commonData.sort(key=lambda word: word[1], reverse=True)

    dataResult = ""
    for d in commonData:
        dataResult += getDataTag(d)

    return render_template("result.html", res = f"<Similarity> : {similarity}\n\n<Common Word : Word Num>\n\n{dataResult}")
Example No. 2
def runCrawl(url_list, db_name='crawl-data', manual=True, append=False):
    '''
    Basic wrapper around the OpenWPM crawl script.
        url_list: list of URLs to crawl, e.g. ['https://abc.com', 'https://xyz.net']
        db_name: name of the database to write crawl output to
        manual: set to False to skip the crawl-data deletion warning
        append: set to True to append to existing crawl data instead of deleting it
    '''
    if manual and not append:
        go = input(
            "Proceeding will delete the old crawl data. Enter Y to continue.\n"
        )
    else:
        go = 'y'

    if go.lower() == 'y':
        if not append:
            try:
                os.remove('datadir/{}.sqlite'.format(db_name))
            except FileNotFoundError:
                print("Old crawl data not found. Continuing.")

        crawl.crawl(url_list, db_name)
    else:
        print("Aborting.")
Example No. 3
    def daily(self):
        print("Running Daily Job")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)
Example No. 4
def daemon_job(interval):
    """
        main function of the crawler daemon.
        interval: time in seconds between each crawl
    """
    time.sleep(3)  # Wait for api server to start first
    while True:
        try:
            crawl()
            process_notification()
        except Exception:
            traceback.print_exc()
        time.sleep(interval)
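One common way to start such a daemon loop alongside the API server is in a background thread; a sketch under that assumption (the 3600-second interval is an arbitrary placeholder).

import threading

# Run the crawler loop defined above in a daemon thread so it does not
# block the API server; the interval value here is an assumption.
worker = threading.Thread(target=daemon_job, args=(3600,), daemon=True)
worker.start()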
Example No. 5
def main(argv=None):
    try:
        db_cnx = db_conn.db_conn()
        url = sys.argv[1]

        crawl.crawl(url, db_cnx)

        db_cnx.close()
    except IndexError:
        print "Please run with a url (ex. \"python main.py http://www.example.com\")"
    except KeyboardInterrupt:
        print "Stopped crawling on interrupt."
        db_cnx.close()
Example No. 6
def main():
    db_dirname = './data/'
    try:
        os.mkdir(db_dirname)
    except FileExistsError:
        pass
    get_web_req_gap = (eval('lambda : ' + sys.argv[1])
                       if len(sys.argv) > 1 else lambda: random.uniform(1, 3))

    try:
        crawl.init(db_dirname + 'db.tinydb', get_web_req_gap)
        crawl.authorize()
        crawl.crawl(seeds=crawl.url2json('https://api.github.com/users'))
    except Exception as e:
        crawl.cleanup()
        raise e
Example No. 8
def main():
    url = input('URL: ')
    if not url:
        print('No URL was entered.')
        return

    input_type = input('1-TXT, 2-HTML: ')
    if input_type not in ['1', '2']:
        print('Invalid input type.')
        return

    output_unit = input('Output group size (natural number): ')
    if r_non_digit.findall(output_unit):
        print('Input is not a natural number.')
        return

    output_unit = int(output_unit)

    text = crawl(url)
    if input_type == '2':
        text = remove_html_tag(text)

    text = remove_non_alnum(text)
    text = sort_ascending(text)
    shares, rest = divide_text(text, output_unit)

    print('Shares: ', ', '.join(shares))
    print('Remainder: ', rest)
Example No. 9
def spider_scheduling(SpiderGlobalVariable, UrlRule):
    '''
    Scheduler loop: seed the queue from SpiderGlobalVariable.start_url, then keep
    pulling parsed pages, extracting their links, filtering them through UrlRule,
    and pushing accepted UrlNodes back onto the priority queue.
    '''
    for i in init_urlnode(SpiderGlobalVariable.start_url, UrlRule):
        SpiderGlobalVariable.global_urlnode_queue.put((0, i))

    while exit_condition(SpiderGlobalVariable):
        if SpiderGlobalVariable.htmlnode_queue.qsize() > 0:
            html_node = SpiderGlobalVariable.htmlnode_queue.get()
            linklist = crawl(html_node.url, html_node.html)
            for i in linklist:
                url = i[1]
                method = i[0]
                data = i[2]
                depth = html_node.depth
                referer = html_node.url
                i = UrlNode(url, referer, depth, method, data)

                if i.depth <= SpiderGlobalVariable.depth and UrlRule.check_url(
                        i.check_url):
                    if is_netloc(i.url):
                        SpiderGlobalVariable.global_urlnode_queue.put((0, i))
                    else:
                        SpiderGlobalVariable.global_urlnode_queue.put(
                            (random.randint(1, 5), i))

                else:
                    SpiderGlobalVariable.refuse_count += 1
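The tuple priorities above mean same-netloc links (priority 0) are fetched before external links (random priority 1-5). A small stand-alone illustration of that ordering, using plain strings instead of UrlNode objects:

from queue import PriorityQueue

q = PriorityQueue()
q.put((3, 'https://external.example/page'))   # external link, lower priority
q.put((0, 'https://target.example/page'))     # same-netloc link, fetched first
print(q.get())  # -> (0, 'https://target.example/page')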
Example No. 11
def search(request):
    if request.method == "POST":
        keyword = request.POST.get("keyword")
        print(keyword)
        url = "http://www.enuri.com/search.jsp?nosearchkeyword=&issearchpage=&searchkind=&es=&c=&ismodelno=false&hyphen_2=false&from=list&owd=&keyword={0}".format(
            keyword)
        makewordcloud.cloud(sementic_classifier.test(crawl.crawl(url)))
        return render(request, 'search.html')
Example No. 12
 def run(self):
     # time.sleep(0.3)
     # For testing, reuse the same data:
     # is_ok, content, satellite_data = crawl_test(self.satellite_data, self.model, "", self.create_time, self.end_time)
     is_ok, content, satellite_data = crawl(self.satellite_data,
                                            self.username, self.password,
                                            self.model, self.telemetry_num,
                                            self.create_time, self.end_time)
     self._signal.emit((is_ok, content, satellite_data))
Example No. 13
def main():
    elastic = Elastic()
    config = get_params()
    path = config['crawler']['path']

    # download imdb files
    wget.download(BASICS, out=path)
    wget.download(RATINGS, out=path)
    wget.download(EPISODES, out=path)

    # crawl movies and add to elastic
    crawl()
    elastic.insert_elastic()

    # remove files
    os.remove(f"{config['crawler']['path']}/title.basics.tsv.gz")
    os.remove(f"{config['crawler']['path']}/title.ratings.tsv.gz")
    os.remove(f"{config['crawler']['path']}/title.episode.tsv.gz")
Example No. 14
def proc(u, epoch):
    ip = u["ip"]
    user = u["id"]
    service = SERVICE_ID
    flag = generateFlag(user, service, epoch)
    point = crawl(ip, flag)
    print("epoch:{} user:{} service:{} ip:{} is {}pt".format(
        epoch, user, service, ip, point))
    sendStatus(user, service, epoch, point)
Example No. 15
def MainXSS():
    print PrintXssMask()

    output = MakeSelection()

    if output is None:
        return

    for rhost in crawl():
        StartAttack(output, rhost)
Example No. 16
 def crawl(self):
     print("Crawling")
     #the crawl function performs a hash index of all files in the target directories
     parser = argparse.ArgumentParser(
         description='Crawl the dirs and create a hash index')
     parser.add_argument('-f',
                         '--force',
                         action='store_true',
                         default=False,
                         help='forces indexing of all directories')
     parser.add_argument('-p', '--path', help='pass a directory to crawl')
     args = parser.parse_args(sys.argv[2:])
     #Crawl a provided directory
     if args.path:
         crawl.crawl(args.force, args.path, jsondatadir)
     else:
         ingestsum = crawl.crawl(args.force, ingestdir, jsondatadir)
         workingdirsum = crawl.crawl(args.force, workingdir, jsondatadir)
         archivedirsum = crawl.crawl(args.force, archivedir, jsondatadir)
Example No. 17
def do():
    target = int(request.form.get("twa"))
    result = dict()
    twa = []
    title = []
    url = []
    twa, title, url = crawl(target)
    result["twa"] = twa
    result["title"] = title
    result["url"] = url
    print(result)
    return jsonify(result)
Example No. 18
def getData():
    words = crawl(request.form["url"])
    documents.append(words)

    global data
    data = sorted(words.items(), key=operator.itemgetter(1), reverse=True)

    dataResult = ""
    for d in data:
        dataResult += getDataTag(d)

    return render_template("result.html", res = f"<Word : Word Num>\n\n{dataResult}")
Example No. 19
def pbsc(username):
    ress = []
    ressExtro = 'Normal'  # result of prediction
    ressAgree = 'Normal'
    ressCons = 'Normal'
    crawl.crawl(username)
    clr1.clean1(username)
    clr2.clean2(username)
    # logic Extroversion
    lenTW = extro.lenTW(username)
    if lenTW != 0:
        extroValue = extro.driver(username)
        if extroValue[1] >= pExto and extroValue[3] <= qExtro:
            ressExtro = 'High'
        elif extroValue[3] >= qExtro and extroValue[1] <= pExto:
            ressExtro = 'Low'

        # logic Agreeableness
        agreValue = agree.driver(username)
        if agreValue[1] >= pAgre and agreValue[3] <= qAgre:
            ressAgree = 'High'
        elif agreValue[3] >= qAgre and agreValue[1] <= pAgre:
            ressAgree = 'Low'

        # logic Conscientiousness
        conValue = cons.driver(username)
        # print(conValue)
        if conValue[1] >= pCons and conValue[3] <= qCons:
            ressCons = 'High'
        elif conValue[3] >= qCons and conValue[1] <= pCons:
            ressCons = 'Low'
    else:
        ressAgree = 'Normal'
        ressCons = 'Normal'
        ressExtro = 'Normal'
    ress.append(username)
    ress.append(ressExtro)
    ress.append(ressAgree)
    ress.append(ressCons)
    return ress
Example No. 20
def Test():
    TITLE, URL = crawl.crawl(inUrl,depth)
    for i in range(0,len(URL)):
        print "Title: ", TITLE[i], "\t", "URL: ", URL[i]
        print ' Check the correctness of this Title:'
        print '  is_alpha(TITLE[i]) and TITLE[i].isalpha(): '
        print '  ',is_alpha(TITLE[i]),TITLE[i].isalpha()
        print '  is_alnum(TITLE[i]) and TITLE[i].isalnum(): '
        print '  ',is_alnum(TITLE[i]), TITLE[i].isalnum()
        print '  startswith(TITLE[i],TITLE[i]) and TITLE[i].startswith(TITLE[i]):'
        print '  ',startswith(TITLE[i],TITLE[i]), TITLE[i].startswith(TITLE[i])
        print '  is_in(TITLE[i],TITLE[i]) and TITLE[i] in TITLE[i]'
        print '  ',is_in(TITLE[i],TITLE[i]), TITLE[i] in TITLE[i]
        print '\n'
Example No. 21
    def test(self):
        print("Running Full Test Sequence")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)

        #after the dedupe function has moved duplicates out, reindex
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)

        #the archive function pulls from the working/media directory and pools into sized volumes
        archive.archive(archivedir, jsondatadir, workingdir, mediasize)

        #validate that all files in duplicates exist elsewhere before moving to validated
        validate.validate(duplicatedir, workingdir, archivedir, validateddir)

        print("Daily Job Completed Successfully")
Example No. 22
def print_result():
    if request.method == 'POST':
        url = request.form['url']
        input_type = request.form['type']
        output_unit = int(request.form['unit'])

        text = crawl(url)
        if input_type.upper() == 'HTML':
            text = remove_html_tag(text)

        text = remove_non_alnum(text)
        text = sort_ascending(text)
        shares, rest = divide_text(text, output_unit)

        return render_template('main.html', shares=shares, rest=rest)
    else:
        return redirect(url_for('main_page'))
Example No. 23
    def run(self):
        """  deploys bundled basicnode module to plab nodes using plab_assistant.
         crawls the ring continuously for 48 hours and then gathers logs. """
        plab = plab_assistant.plab_assistant("install", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        os.chdir(self.base_path + "/node")
        os.system("sed 's/<Enabled>false/<Enabled>true/' -i node.config." +
                  self.slice_name)
        # start local basicnode without cronolog...
        start_basicnode = "./basicnode node.config." + self.slice_name + " &> log &"
        p = subprocess.Popen(start_basicnode.split(' '))
        self.local_basicnode_pid = p.pid
        time.sleep(60)

        os.chdir(self.base_path)
        start_utc = datetime.datetime.utcnow()
        test_length = datetime.timedelta(hours=48)
        content = open("node/node.config." + self.slice_name).read()
        port_line = re.search("<XmlRpcManager>.*</XmlRpcManager>", content,
                              re.S).group()
        port = int(re.search("\d+", port_line).group())
        while datetime.datetime.utcnow() - start_utc < test_length:
            nodes = crawl.crawl(port)
            consistency, count = crawl.check_results(nodes)
            os.chdir(self.base_path)
            f = open("crawl.csv", "a")
            f.write(
                str(time.asctime()) + ", " + str(consistency) + ", " +
                str(count) + "\n")
            f.close()
            time.sleep(60 * 15)
        # done with the test, start getting logs and cleaning up.
        plab = plab_assistant.plab_assistant("get_logs", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        os.system("zip -r9 results.zip logs output.log crawl.csv")
        # Actually not necessary because installation cleans nodes first.
        plab = plab_assistant.plab_assistant("uninstall", nodes=None, username=self.slice_name, \
            path_to_files=self.path_to_files, ssh_key=self.ssh_key)
        plab.run()
        try:
            os.kill(self.local_basicnode_pid, signal.SIGKILL)
        except:
            pass
Example No. 25
 def crawler(self):
     web_tb = self.webvar1.get()
     web_jd = self.webvar2.get()
     web_am = self.webvar3.get()
     web_all = self.webvar4.get()
     depth = self.dp_var.get()
     goods = self.goods_var.get()
     cr = crawl.crawl()
     if web_all == 1:
         cr.crawler_all(depth, goods)
         messagebox.showinfo(title='result', message='All crawling finished')
     else:
         if web_jd == 1:
             cr.crawler_jd(depth, goods)
             messagebox.showinfo(title='result', message='jd crawl finished')
         if web_tb == 1:
             cr.crawler_tb(depth, goods)
             messagebox.showinfo(title='result', message='tb crawl finished')
         if web_am == 1:
             cr.crawler_am(depth, goods)
             messagebox.showinfo(title='result', message='TMALL crawl finished')
Example No. 26
def main():
  try:
    optlist, args = getopt.getopt(sys.argv[1:], "", ["debug", "port="])

    port = 10000
    debug = False

    for k,v in optlist:
      if k == "--port":
        port = int(v)
      elif k == "--debug":
        debug = True
  except:
    print usage
    return

  nodes = crawl.crawl(port, debug = debug) 
  print crawl.check_results(nodes)

  for task in tasks:
    print task.run(port, nodes, debug)
Example No. 27
def crawling():
    while True:
        print(time.ctime())
        crawl()
        time.sleep(10)
Example No. 28
from apscheduler.schedulers.background import BackgroundScheduler
import pandas as pd
import time
import crawl
import datetime
import mail
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read('config.ini')

fmk_news = crawl.crawl(url=cfg["fmk_news"]["url"],
                       title=cfg["fmk_news"]["title"],
                       time_=cfg["fmk_news"]["time_"],
                       site=cfg["fmk_news"]["site"],
                       category="News",
                       sub="\[[0-9]{1,4}\]")
fmk_fashion = crawl.crawl(url=cfg["fmk_fashion"]["url"],
                          title=cfg["fmk_fashion"]["title"],
                          time_=cfg["fmk_fashion"]["time_"],
                          site=cfg["fmk_fashion"]["site"],
                          category="Fashion",
                          sub="\[[0-9]{1,4}\]")
naver_finance_news = crawl.crawl(url=cfg["naver_finance_news"]["url"],
                                 title=cfg["naver_finance_news"]["title"],
                                 time_=cfg["naver_finance_news"]["time_"],
                                 site=cfg["naver_finance_news"]["site"],
                                 address=cfg["naver_finance_news"]["address"],
                                 category="Finance",
                                 sub="\n[\s\S]*")
Example No. 29
def scheduled_job():
    print('Crawl script runs every day at 6am [timezone = +7.00]')
    crawl()
    print("Crawl: Done")
Example No. 30
import os
import random
import time

import tweepy

from crawl import crawl  # assumed source of the crawl() call below

class TwitterAPI:
    """
    Class for accessing the Twitter API.

    Requires API credentials to be available in environment
    variables. These will be set appropriately if the bot was created
    with init.sh included with the heroku-twitterbot-starter
    """
    def __init__(self):
        consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
        consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
        access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
        auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(auth)

    def tweet(self, message):
        """Send a tweet"""
        self.api.update_status(message)

if __name__ == "__main__":
    twitter = TwitterAPI()
    while True:
        tweet = crawl()
        if tweet:
            twitter.tweet(tweet)
        time.sleep(21600 + random.randint(0, 21600)) # wait 6-12 hours
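The class docstring above notes that credentials come from environment variables; a placeholder sketch of providing them for local testing (all values are dummies, not real keys):

import os

os.environ.setdefault('TWITTER_CONSUMER_KEY', 'dummy-key')
os.environ.setdefault('TWITTER_CONSUMER_SECRET', 'dummy-secret')
os.environ.setdefault('TWITTER_ACCESS_TOKEN', 'dummy-token')
os.environ.setdefault('TWITTER_ACCESS_TOKEN_SECRET', 'dummy-token-secret')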
Example No. 31
#-*- encoding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import  Queue
import thread
from bs4 import BeautifulSoup
urllis_q = Queue.Queue()
from crawl import crawl
import time 

crawler_task1 = crawl()
crawler_task2 = crawl()


def list_zhidao_url():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C0%B1%BD%B7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i*10)


def list_zhidao_url_xiaomi():
    url = '''http://zhidao.baidu.com/search?word=%D0%A1%C3%D7%CA%D6%BB%FA%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(76):
        yield url + str(i*10)


def list_zhidao_url_coolpad():
    url = '''http://zhidao.baidu.com/search?word=%BF%E1%C5%C9%B4%F3%C9%F1%D4%F5%C3%B4%D1%F9&ie=gbk&site=-1&sites=0&date=0&pn='''
    for i in range(73):
        yield url + str(i*10)
Example No. 32
from crawl import crawl
import re

c = crawl()
c.set_root_link('https://www.gog.com/')
c.set_filename('gogData.db')
c.set_link_checker(lambda s: '/game/' in s)
c.set_link_rule(lambda link: re.search(r".*www\.gog\.com.*", link) is not None)
c.set_content_rule('name', lambda s: s.find('h1', {
    'class': 'header__title'
}).text)
c.set_content_rule(
    'price', lambda s: str(
        float(
            re.search(
                r'<span\ class="_price">(\ |\n)+([0-9]|\.)+(\ |\n)+</span>',
                s.prettify()).group()[21:-7].strip()) * 30))
c.set_content_rule(
    'discount', lambda s: str(
        float(s.find('span', {
            'class': 'buy-price__new'
        }).text) * 30))
c.set_content_rule(
    'feature', lambda s: ", ".join([
        i.text.strip()
        for i in s.findAll('span', {'class': 'game-features__title'})
    ]))

c.run(1)
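To see what a content rule like 'name' above extracts, here is a self-contained illustration using a hand-written HTML fragment; the markup is a placeholder mimicking the expected structure, not GOG's real page.

from bs4 import BeautifulSoup

html = '<html><body><h1 class="header__title">Example Game</h1></body></html>'
soup = BeautifulSoup(html, 'html.parser')

name_rule = lambda s: s.find('h1', {'class': 'header__title'}).text
print(name_rule(soup))  # -> Example Game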
Example No. 33
import sys
import webbrowser
import re
import time
import crawl

SHOP_URL = 'http://www.ecocion-shop.de/web/main.php/shop/addCart?live_pid='
DELAY = 3

if __name__ == '__main__':
    with open(sys.argv[1], 'r') as input_file:
        fresh_item_list = ['empty row']
        crawl.crawl(fresh_item_list, fresh_item_list, crawl.FRESH_CATEGORIES_LIST)
        if 'frisch' not in input_file.name:
            dry_item_list = ['empty row']
            crawl.crawl(fresh_item_list, dry_item_list, crawl.ALL_ITEMS)

        with open(crawl.DRY_ARTICLES_CSV, 'r') as dry_file:  # for comparison to detect price changes
            old_dry_item_list = dry_file.readlines()

        # find first occurrence of 'Nummer    Menge   Name'
        order = input_file.read().split('Nummer    Menge   Name', 1)[1]

        # split into lines
        lines = order.split('\n')

        ask_to_proceed = False
        id_count_name_list = []
        for line in lines:
            splitted_line = re.split(r' +', line)  # e.g. ['', '392459', '2', 'Bioland', 'Joghurt', 'Natur', '1L']
            if len(splitted_line) > 3:
Example No. 34
     create_dirs("temp/")
     #Logging
     sys.stdout = Logger()
     #Check Python version
     print("Using Python version " +\
           str(sys.version_info.major) + "." +\
           str(sys.version_info.minor) + "." +\
           str(sys.version_info.micro) + " " +\
           sys.version_info.releaselevel + " " +\
           str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit")
     if sys.version_info.major != 2:
         print("Not supported; use Python 2")
     elif 0:
         print("")
         #start
         crawler = crawl()
         crawler.start()
         try:
             while(True): time.sleep(2)
         except KeyboardInterrupt:
             try:
                 print("    -> Closing program")
                 time.sleep(5)
                 print("    -> Analyzing")
             except KeyboardInterrupt:
                 print("    -> Skipping analysis")
                 crawler.quit_analyze = False
             crawler.quit = True
             crawler.join()
     print("I don't think there is a need to run another Steam crawler")
 raw_input("Finished")
Example No. 35
            url = urllib.parse.urlparse(line.strip().split(',', 1)[0])
            assert url.scheme.startswith('http') 
            str_url = '{}://{}/'.format(url.scheme, url.netloc)
            if str_url not in unique_urls:
                unique_urls.add(str_url)
                queue.put(str_url)


os.makedirs('data/output', exist_ok=True)

links_queue = queue.Queue()
NUM_THREADS=1

threads = []
for i in range(NUM_THREADS):
    threads.append(threading.Thread(target=run, args=(i, links_queue)))
    threads[-1].start()


if len(sys.argv) < 2:
    crawl.crawl(links_queue)
else:
    read_links_from_file(sys.argv[1], links_queue)

for i in range(NUM_THREADS):
    links_queue.put(None)

for thread in threads:
    thread.join()

Example No. 36
import crawl

soup = crawl.crawl("https://www.ii.uni.wroc.pl/~marcinm/", 13, crawl.action)
soup2 = crawl.crawl("https://pl.wikipedia.org/wiki/Python", 3, crawl.action)
soup3 = crawl.crawl('https://www.ii.uni.wroc.pl/~marcinm/dyd/python/', 3,
                    crawl.action)

print("\n\nII.UNI.WROC.PL/~MARCINM\n\n")
for i in soup:
    print(i)
print("\n\nWIKIPEDIA\n\n")
for i in soup2:
    print(i)
print("\n\nDYD/PYTHON/\n\n")
for i in soup3:
    print(i)
"""
g1, g2- generator

def sum (g1, g2):
    for i in g1:
        yield i
    for i in g2:
        yield i

def sum (g1, g2):
    yield from g1
    yield from g2

def f():
    yield <...>
Example No. 37
	def setUp(self):
		self.crawl = crawl(path)
Example No. 38
# -*- coding: utf-8 -*-
__author__ = 'Taikor'


from crawl import crawl
from output_excel import output_excel
from copyfile import copyfile

if __name__ == "__main__":
    crawl()
    output_excel()
    copyfile(r'C:\xampp\htdocs\cosmetic_price_check\data\jd\MianBuDiShuang')
    print("Crawler Task Completed !")


Example No. 39
def crawl_train_schedules():
    debug("Crawling Train schedules")
    count = db.query(func.count(TempTrain.id)).scalar()
    for i, row in enumerate(db.query(TempTrain.url)):
        debug("Crawing %s of %s. Remaining %s" % (i, count, count-i))
        crawl(row.url)
Example No. 40
def route_generate():

	crawl.crawl()

	return redirect(url_for("route_download"))
Example No. 41
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 10:17:10 2018

@author: redman
"""
import requests
from bs4 import BeautifulSoup
import argparse
import crawl
import push
import popular
import findword
import sys

if __name__ == '__main__':

    if (sys.argv[1] == 'crawl'):
        #print('boo')
        crawl.crawl()
    if (sys.argv[1] == 'push'):
        #print('good')
        push.countpush(sys.argv[2], sys.argv[3])
    if (sys.argv[1] == 'popular'):
        popular.countpopular(sys.argv[2], sys.argv[3])
    if (sys.argv[1] == 'findword'):
        #print('hello')
        #print(sys.argv[2])
        findword.word(sys.argv[3], sys.argv[4], sys.argv[2])
Example No. 42
def server():
    condition = request.args.get('keyword')
    return crawl(condition)