Example No. 1
 def __init__(self, mem_size: int = 10000):
     """
             The following is an abstraction for a bank of values
             such as valA, which will be used during each cycle.
             It's set up as an object to avoid circular import.
     """
     self.ValBank = ValBank()
     """
     The following are functional units like memory,
     registers, or flags
     """
     self.Memory = Memory(mem_size)
     self.RegisterBank = RegisterBank()
     self.ZF = CCFlag("ZF")  # zero flag
     self.OF = CCFlag("OF")  # overflow flag
     self.SF = CCFlag("SF")  # sign flag
     self.ErrorFlag = StateFlag("Error Flag", error_lib)
     self.StateFlag = StateFlag("State Flag", state_lib)
     self.ALU = ALU(self.ValBank, self.StateFlag, self.ErrorFlag, self.SF, self.OF, self.ZF)
     """
     The following are functional abstractions of operations
     that the processor performs
     """
     self.Fetcher = Fetcher(self.ValBank, self.RegisterBank, self.Memory, self.StateFlag, self.ErrorFlag)
     self.Decoder = Decoder(self.ValBank, self.RegisterBank, self.Memory)
     self.Executor = Executor(self.ValBank, self.ALU, self.OF, self.ZF, self.SF)
     self.Memorizer = Memorizer(self.ValBank, self.Memory)
     self.RegWriter = RegWriter(self.RegisterBank, self.ValBank)
     self.PCUpdater = PCUpdater(self.RegisterBank, self.ValBank)
Example No. 2
    def test_transferSourceToSink_actualData(self):

        try:
            sourcedbconnectionstring = 'sqlite:///../../ContentData/eb99f209f9c34ba192f6e695aeb37e4f.sqlite3'

            sourceTableName = 'Content_contentnode'

            fetchObj = Fetcher()
            fetchObj.transferSourceToSink(sourcedbconnectionstring,
                                          sourceTableName,
                                          self.sinkDbConnectionString)
            staging_engine = create_engine(self.sinkDbConnectionString)
            staging_metadata = MetaData(bind=staging_engine)
            Content_Node = Table('content_contentnode',
                                 staging_metadata,
                                 autoload=True,
                                 autoload_with=staging_engine)
            session = Session(staging_engine)
            self.assertTrue(session.query(Content_Node).count() > 0)

        except Exception as e:
            print(e)
            self.assertFalse(
                True,
                'There was an exception while loading from source to sink')
Example No. 3
class ProxyPool:
    def __init__(self):
        self.fetcher = Fetcher()
        self.proxies_pool = set()
        self.init_pool()
        gevent.spawn(self.fetch_forever)
        gevent.spawn(self.test_forever)

    def init_pool(self):
        self.proxies_pool.update(self.fetcher.fetch())

    def test_forever(self):
        pool = Pool(POOL_SIZE)
        while True:
            proxies_pool = list(self.proxies_pool)
            print 'Start testing.'
            pool.map(self.test, proxies_pool)
            pool.join()
            print '{0} proxies could be used.'.format(len(self.proxies_pool))
            time.sleep(TEST_INTERVAL)

    def test(self, proxy):
        if not tester(proxy):
            try:
                self.proxies_pool.remove(proxy)
            except KeyError:
                pass

    def fetch_forever(self):
        while True:
            time.sleep(FETCH_INTERVAL)
            self.proxies_pool.update(self.fetcher.fetch())
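
The tester() helper used in test() above is not shown in this example; a minimal stand-in, assuming a plain host:port proxy string and the requests library (an illustration, not the original project's code), might look like:

import requests

def tester(proxy, timeout=5):
    # Hypothetical check, not from the original project: try one request through the proxy.
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': 'http://' + proxy},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False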
Example No. 4
File: Robot.py Project: tvs/creepy
 def _get_robots(uri, user_agent, timeout):
     rtxt = None
     uri = urlparse.urljoin(uri, '/robots.txt')
     io = Fetcher(uri)
     response = io.get_response()        
     if response and response.headers.type == 'text/plain' and urlparse.urlparse(response.url).path == '/robots.txt': # To prevent redirects to HTML page
         rtxt = io.get_content()
     return rtxt
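
The returned robots.txt body would normally be handed to a parser; a sketch using the standard library robotparser module (added here for illustration, not part of this project) could be:

import robotparser  # urllib.robotparser in Python 3

def is_allowed(robots_txt, user_agent, url):
    # Hypothetical helper: parse the fetched robots.txt text and check a single URL.
    rp = robotparser.RobotFileParser()
    rp.parse(robots_txt.splitlines())
    return rp.can_fetch(user_agent, url)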
Example No. 5
class Processor:
    """
    This represents the processor itself. Most work and operations
    will be outsourced to other objects referenced by the processor.
    The processor holds the values of valP, valE, valA, valB, valC,
    and rA, rB.
    """

    def __init__(self, mem_size: int = 10000):
        """
                The following is an abstraction for a bank of values
                such as valA, which will be used during each cycle.
                It's set up as an object to avoid circular import.
        """
        self.ValBank = ValBank()
        """
        The following are functional units like memory,
        registers, or flags
        """
        self.Memory = Memory(mem_size)
        self.RegisterBank = RegisterBank()
        self.ZF = CCFlag("ZF")  # zero flag
        self.OF = CCFlag("OF")  # overflow flag
        self.SF = CCFlag("SF")  # sign flag
        self.ErrorFlag = StateFlag("Error Flag", error_lib)
        self.StateFlag = StateFlag("State Flag", state_lib)
        self.ALU = ALU(self.ValBank, self.StateFlag, self.ErrorFlag, self.SF, self.OF, self.ZF)
        """
        The following are functional abstractions of operations
        that the processor performs
        """
        self.Fetcher = Fetcher(self.ValBank, self.RegisterBank, self.Memory, self.StateFlag, self.ErrorFlag)
        self.Decoder = Decoder(self.ValBank, self.RegisterBank, self.Memory)
        self.Executor = Executor(self.ValBank, self.ALU, self.OF, self.ZF, self.SF)
        self.Memorizer = Memorizer(self.ValBank, self.Memory)
        self.RegWriter = RegWriter(self.RegisterBank, self.ValBank)
        self.PCUpdater = PCUpdater(self.RegisterBank, self.ValBank)

    def run(self):
        """
        This is the only necessary public method for the processor.
        Currently the way to operate this is by manually loading values
        into the memory using the place_instruction method, then calling
        the 'run' method for the processor. Afterwards calling print
        on the memory object will reveal the finish state of the processor.
        """
        while self.StateFlag.State == 0:
            self.Fetcher.fetch()
            self.Decoder.decode()
            self.Executor.execute()
            self.Memorizer.memory_write()
            self.RegWriter.write_back()
            self.PCUpdater.update_pc()
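
Going by the run() docstring, a driver for this processor might look roughly like the sketch below; place_instruction is mentioned in the docstring, but its exact location on the Memory object and its signature are assumptions here.

# Sketch only: the place_instruction signature and the instruction encoding are
# assumptions based on the run() docstring, not taken from the source.
program = {0: "00"}  # address -> encoded instruction; "00" as a halt-like opcode is illustrative

proc = Processor(mem_size=1000)
for addr, encoding in program.items():
    proc.Memory.place_instruction(addr, encoding)  # assumed API
proc.run()           # cycles fetch/decode/execute/memory/write-back/PC update until StateFlag changes
print(proc.Memory)   # per the docstring, printing memory reveals the finish state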
Example No. 6
def main():
    from Fetcher import Fetcher
    from post import Post

    link = r'http://www.geeksforgeeks.org/find-xor-of-two-number-without-using-xor-operator/'
    fetcher = Fetcher(logging.DEBUG)
    host, response = fetcher.fetch(link.strip())

    post = Post(link=link, host_name=host, raw_page=response)

    content = GeeksForGeeksParser.parse(post)
    print "===== back to main method ====="
    print content
Example No. 7
def main():

    # create share queue
    SHARE_Q = queue.Queue()

    # Start Fetcher fetch new metadata, and put into share queue
    fetcher = Fetcher(5, SHARE_Q)
    fetcher.start()

    # Start a downloader, get one metadata from queue,
    # then download open data and archive into ckan
    downloader = Downloader(SHARE_Q)
    downloader.start()
Example No. 8
def config_login():
    config = ConfigParser.ConfigParser()
    config.read("config.ini")
    email = config.get("account1", "email")
    password = config.get("account1", "password")
    seed=config.get("seed_url", "seed")
    db=config.get('database', 'db')
    collection_info=config.get('collection', 'table_info')
    collection_relation=config.get('collection', 'table_relation')
    method=Fetcher()
    method.login(email, password)
    url=seed
    return url, method, db, collection_info, collection_relation
Example No. 9
    def test_cleansink(self):
        try:
            sourcedbconnectionstring = 'sqlite:///../../ContentData/eb99f209f9c34ba192f6e695aeb37e4f.sqlite3'

            sourceTableName = 'Content_contentnode'

            fetchObj = Fetcher()
            fetchObj.cleansink(sourcedbconnectionstring, sourceTableName,
                               self.sinkDbConnectionString)
            self.assertTrue(True)
        except Exception as e:
            print(e)
            self.assertFalse(True,
                             'There was an exception while cleaning the sink')
Example No. 10
    def crawl(self, tid):
        """Crawl given URL"""
        while True:
            try:
                if self.threshold > 0 and self._pagesstored + 1 > self.threshold:
                    print "### Number of URLs crawled:", self._pagesstored
                    self.watcher.kill()
                    break

                url = self.frontier.get()
                if self.verbose:
                    print "  Crawler #%d: %s" % (tid, url)

                robot = self.robotstorage.get_robot(url)
                if robot.is_allowed(url):
                    # Delay processing
                    d_time = robot.delay_remaining(time.time())
                    if d_time > 0:
                        if self.verbose > 1:
                            print "\n###*** Delaying for %f seconds" % d_time
                        time.sleep(d_time)

                    # Update the last request time
                    robot.update_last_request(time.time())

                    # Download the Page
                    page = Fetcher(url, verbose=self.verbose)
                    doc = page.get_content()
                    if doc:
                        self.pagestorage.store(url, doc)
                        self._lock.acquire()
                        try:
                            self._pagesstored = self._pagesstored + 1
                        finally:
                            self._lock.release()

                        # Parse Page for Links
                        p = Parser(url, doc)
                        links = p.get_links()
                        if self.verbose > 1:
                            print "    # links on %s: %d" % (url, len(links))
                        for link in links:
                            self.queue_url(link)
                else:
                    if self.verbose > 1:
                        print "*** URL not allowed:", url
            finally:
                self.frontier.task_done()
Example No. 11
def main():


    SHARE_Q = queue.Queue()


    # r = requests.get(const.METADATA_URL_PREFIX)
    # x = json.loads(r.text)

    # dataid = x['result']
    # count = 1
    conn = DBUtil.createConnection()
    # for s in dataid:
    #     DBUtil.insertDataSetID(conn, s)
    #     logger.debug(str(count) + " _ " + s)
    #     count = count +1

    count = 1
    dataid2 = DBUtil.getNotProcessedDataSet(conn)
    # for s in dataid2:
    # logger.debug(str(count) + " _ " + s.package_name)
    # logger.debug(str(count) + " _ " + s)
    # count = count +1

    DBUtil.closeConnection(conn)

    fetchers = []
    if (config.fetcher_num > 0):
        for i in range(config.fetcher_num):
            fetchers.append(Fetcher(SHARE_Q, i, dataid2))
            fetchers[i].start()
Example No. 12
def search(keyword):
    # a quick hack to clear out the value or attribute action in the form
    action_name = ' '

    form = KeywordForm(csrf_enabled=True)
    onlyFirstPage = False

    searchResult = None

    if app.config['DATASOURCE'] == 'local':
        dictDb = DictionaryDB(limit=onlyFirstPage)
        searchResult = dictDb.search(keyword)
    else:
        # send request to j-doradic and collect word definitions here
        fetcher = Fetcher(onlyFirstPage, keyword)
        searchResult = fetcher.performSearch()

    return render_template('search_result.html',
                           form=form,
                           action_name=action_name,
                           onlyFirstPage=onlyFirstPage,
                           keyword=keyword,
                           searchResult=searchResult)
Example No. 13
class ProxyPool:
    def __init__(self):
        self.fetcher = Fetcher()
        self.proxies_pool = set()
        self.init_pool()
        gevent.spawn(self.fetch_forever)
        gevent.spawn(self.test_forever)

    def init_pool(self):
        try:
            with open('proxies.json') as f:
                self.proxies_pool = set(json.load(f))
        except IOError:
            self.proxies_pool.update(self.fetcher.fetch())

    def test_forever(self):
        pool = Pool(POOL_SIZE)
        while True:
            proxies_pool = list(self.proxies_pool)
            print 'Start testing.'
            pool.map(self.test, proxies_pool)
            pool.join()
            with open('proxies.json', 'w+') as f:
                json.dump(proxies_pool, f)
            time.sleep(TEST_INTERVAL)

    def test(self, proxy):
        if not tester(proxy):
            try:
                self.proxies_pool.remove(proxy)
            except KeyError:
                pass

    def fetch_forever(self):
        while True:
            time.sleep(FETCH_INTERVAL)
            self.proxies_pool.update(self.fetcher.fetch())
Example No. 14
def main():
    if len(sys.argv) < 2:

        SHARE_Q = queue.Queue()

        # ---- Even though first_run.py restart, all data id should fetch ONLY ONE time -----
        conn = DBUtil.createConnection()
        if DBUtil.isDatasetEmpty(conn) is True:
            r = requests.get(const.METADATA_URL_PREFIX)
            x = json.loads(r.text)

            dataset = x['result']
            count = 1
            logger.debug("Current data id size is " + str(len(dataset)))
            for id in dataset:
                DBUtil.insertDataSetID(conn, id)
                logger.debug(str(count) + " _ " + id)
                count = count + 1
            # set a finished flag to ensure wouldn't redo
            DBUtil.insertDataSetDoneFlag(conn)
        # ---- Even though first_run.py restart, all data id should fetch ONLY ONE time -----

        count = 1
        dataid = DBUtil.getNotProcessedDataSet(conn)
        for s in dataid:
            # logger.debug(str(count) + " _ " + s.package_name)
            # logger.debug(str(count) + " _ " + s)
            count = count + 1

        DBUtil.closeConnection(conn)

        fetchers = []
        if config.fetcher_num > 0:
            for i in range(config.fetcher_num):
                fetchers.append(Fetcher(SHARE_Q, i, dataid))
                fetchers[i].start()

        # Block at main(), Downloader will start after all fetchers finish
        '''
        if config.fetcher_num > 0 :
            for i in range(config.fetcher_num):
                fetchers[i].join()
        '''

        downloaders = []
        if config.downloader_num > 0:
            for i in range(config.downloader_num):
                downloaders.append(Downloader(SHARE_Q))
                downloaders[i].start()
Example No. 15
    def LoadSourceToStaging(self, filePath, sinkDbConnectionString,
                            sourceTableNames, extension):
        try:
            objList = []
            dbFiles = self.getSortesListofFiles(filePath)
            sourceTableNameList = sourceTableNames.split(",")

            # logging.info('sourceTableNameList:'+ str(sourceTableNameList))
            # validate data fetched from configuration file
            if (not filePath):
                raise ValueError('Source File path is empty!')
            elif (not sinkDbConnectionString):
                raise ValueError('Sink DB connection string is empty!')
            elif (not sourceTableNames):
                raise ValueError('Source Table Names is empty!')
            elif (not dbFiles or len(dbFiles) == 0):
                raise ValueError('No Db files present!')

            # perform transfer of data from source db to sink db
            for sourceTableName in sourceTableNameList:
                obj = Fetcher()
                isCleaned = 0
                for dbfile in dbFiles:
                    if str(dbfile) in skippedDBList:
                        continue
                    dbfile = filePath + dbfile
                    sourcedbconnectionstring = 'sqlite:///' + dbfile

                    if (isCleaned == 0):
                        obj.cleansink(sourcedbconnectionstring,
                                      sourceTableName, sinkDbConnectionString)
                        isCleaned = 1
                    # obj.transferSourceToSink(sourcedbconnectionstring, sourceTableName, sinkDbConnectionString, dbfile)
                    objArgs = {
                        'sourceDbString': sourcedbconnectionstring,
                        'tableName': sourceTableName,
                        'sinkDbSring': sinkDbConnectionString,
                        'dbName': dbfile
                    }
                    objList.append(objArgs)
            obj = Fetcher()
            p = Pool(5)
            p.map(obj.transferSourceToSink, objList)
            p.close()
            p.join()

        except Exception as e:
            logging.basicConfig(filename='Fetcher.log', level=logging.ERROR)
            logging.error('There is an exception in the code FetcherPlumber!')
            logging.error(e)
            logging.error(traceback.format_exc())
            raise
Example No. 16
    def __init__(self):
        self.queues = []
        self.layers = []
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])

        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())

        self.fetcher = Fetcher()
        self.visited_urls = set()
        self.urls_already_in_layers = set()
Example No. 17
def main():
    # create share queue
    SHARE_Q = queue.Queue()

    # According to configuration, start a thread for fetch
    # history data since last fetch or not.
    if (config.FetchHistory):
        historyFetcher = Fetcher(SHARE_Q)
        historyFetcher.start()

    # Start Fetcher fetch new metadata, and put into share queue
    updateFetcher = Fetcher(SHARE_Q, config.update_interval_sec)
    updateFetcher.start()

    # Start some downloaders, and each get one metadata from queue,
    # then download open data.
    # Use multiple downloaders to increase download speed.
    downloaders = []
    for i in range(config.downloader_num):
        downloaders.append(Downloader(SHARE_Q))
        downloaders[i].start()
Example No. 18
    def LoadSourceToStaging(self, filePath,sinkDbConnectionString,sourceTableNames,extension):
        try:

            dbFiles = self.fetchFilesFromDirectory(filePath, extension)
            sourceTableNameList = sourceTableNames.split(",")

            # validate data fetched from configuration file
            if (not filePath):
                raise ValueError('Source File path is empty!')
            elif (not sinkDbConnectionString):
                raise ValueError('Sink DB connection string is empty!')
            elif (not sourceTableNames):
                raise ValueError('Source Table Names is empty!')
            elif (not dbFiles or len(dbFiles) == 0):
                raise ValueError('No Db files present!')

            # perform transfer of data from source db to sink db
            for sourceTableName in sourceTableNameList:
                obj = Fetcher()
                isCleaned = 0

                for dbfile in dbFiles:

                    dbfile=filePath+dbfile
                    sourcedbconnectionstring = 'sqlite:///' + dbfile

                    if (isCleaned == 0):
                        obj.cleansink(sourcedbconnectionstring, sourceTableName, sinkDbConnectionString)
                        isCleaned = 1

                    obj.transferSourceToSink(sourcedbconnectionstring, sourceTableName, sinkDbConnectionString)

        except Exception as e:
            logging.basicConfig(filename='Fetcher.log', level=logging.ERROR)
            logging.error('There is an exception in the code FetcherPlumber!')
            logging.error(e)
            logging.error(traceback.format_exc())
            raise
Example No. 19
    def crawl(self):
        '''
        main function in the crawling process 
        q <- starting page
        while q not empty:
           url <- q.get()
           if url is new and suitable:
              page <- fetch(url)   
              q.put(urls found in page)
           else:
              nothing
        
        '''
        q = Queue()
        q.put((self.root,0))  #0 stands for the level 
        
        while not q.empty():
            this_url, depth = q.get()
#            this_url_host = urlparse.urlparse(this_url)[1]
            
            #check if it obeys robot courtesy 
            try:
#                self.rp.set_url('http://'+this_url_host+'/robots.txt')
#                self.rp.read()
                #if not allowed by robots.txt, discard it 
                if not self.rp.can_fetch('PyCrawler', this_url):
                    #print this_url+'not allowed by robot\n'
                    continue
            except:
                pass
            
            #discard links over depth
            if depth > self.depth_limit:   
                continue
            
            #check if the page is new 
            if this_url in self.url_seen:
                continue
            else:
                self.url_seen.add(this_url)
            
            #if this url doesn't satisfy the filter condition
            filter_reg = self.confine_prefix
            if not self._url_filter(this_url, filter_reg):
                continue
            
            #the url is new 
            
            do_not_follow = self.pre_visit_filters#[f for f in self.pre_visit_filters if not f(this_url)]
            
            if depth == 0 and [] != do_not_follow:
                print >> sys.stderr, "error, starting url is rejected"
                
            #if no filters failed
            if [] == do_not_follow:
                try:
                    self.visited_links.add(this_url)
                    #self.url_seen.add(this_url)
                    self.num_followed += 1
                    page = Fetcher(this_url,self.page_count)
                    page.fetch()  #fetch the page, save the file into local 
                    
                    self.page_count += 1
                    print self.page_count    #test 
                    for link_url in page.out_links():
                        if link_url not in self.url_seen:   #if the url is new 
                            q.put((link_url,depth+1))
                            #self.url_seen.add(link_url)
                            
                except:
                    print >>sys.stderr, "ERROR: Can't process url '%s' " % this_url
Example No. 20
class ContextGraph(object):

    def __init__(self):
        self.queues = []
        self.layers = []
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])
        self.layers.append([])

        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())
        self.queues.append(LayerQueue())

        self.fetcher = Fetcher()
        self.visited_urls = set()
        self.urls_already_in_layers = set()
    def addToQueue(self, queue, page, score):
        self.queues[queue].putPage(page, score)

    def getNextQueuedPage(self):
        for queue in self.queues:
            if not queue.isEmpty():
                return queue.getPage()
        else:
            print "No urls in queue"
            return 'Empty Queue'

    """Pega pagina, classifica bota na queue correta"""
    def classify(self, page_dict):
        layer = self.classifier.predictQueue(page_dict['text'])
        layer, score = self.classifier.predictQueue(page_dict['text'])
        #ele vai adicionar todos os links dessa pagina na queue classificada, n eh soh um addToQueue, sao varios
        for link in page_dict['links']:
            self.addToQueue(layer, link, score)
        return layer

    def store(self, page_dict, layer, page_id):
        crawl_directory = 'crawl/'
        path = crawl_directory+str(layer)
        self.check_dir(path)
        path = path+'/'+urlparse(page_dict['url']).netloc
        self.check_dir(path)
        path = path+'/'+str(page_id)
        page_store = open(path, 'w')
        page_store.write(page_dict['url']+'\n')
        page_store.write(page_dict['text']+'\n')

    def check_dir(self, path):
        if not os.path.exists(path):
            os.mkdir(path)

    def addSeed(self, _seed):
        self.layers[0].append(_seed)

    def addToLayer(self, layer, page):
        if not page.url in self.urls_already_in_layers:
            self.urls_already_in_layers.add(page.url)
            self.layers[layer].append(page)
#        +'/'+page.url.replace("s", " ")
#        file = open("layers/"+str(layer)+'/'+page.url.replace("/", "_"),"wb")
#        pickle.dump(page, file)

    """ layer = o layer da page passada """
    def constructGraph(self, seed, layer, num_backlinks):
        print "constructing layer:", layer, " of seed:", seed.url
        list_urls = self.getBackLinks(seed.url, num_backlinks, layer)
        if layer<=3:
            for url in list_urls:
                page = Page(url)      #points_to = seed
                print 'fetching backlink page: ', url
                page_dict = self.fetcher.getPage(url)
                page.setText(page_dict['text'])
                self.addToLayer(layer+1, page)

    def initGraph(self):
        """1 de cada"""
#        for seed in self.layers[0]:
#            self.constructGraph(seed, 0, 300)
#        for page in self.layers[1]:
#            self.constructGraph(page, 1, 1)
#        for page in self.layers[2]:
#            self.constructGraph(page, 2, 1)
#        for page in self.layers[3]:
#            self.constructGraph(page, 3, 1)
#        for page in self.layers[4]:
#            self.constructGraph(page, 4, 1)
        """aleatorio"""
        num_backlinks = 10
        num_each = 10
        for seed in self.layers[0]:
            self.constructGraph(seed, 0, num_backlinks)
        already_used= set()
        for num in range(0, num_backlinks):
            i = random.randint(0, len(self.layers[1])-1)
            while i in already_used:
                i = random.randint(0, len(self.layers[1])-1)
            already_used.add(i)
            self.constructGraph(self.layers[1][i], 1, num_each)
        already_used= set()
        for num in range(0, num_backlinks):
            i = random.randint(0, len(self.layers[2])-1)
            while i in already_used:
                i = random.randint(0, len(self.layers[2])-1)
            already_used.add(i)
            self.constructGraph(self.layers[2][i], 2, num_each)
        already_used= set()
        for num in range(0, num_backlinks):
            i = random.randint(0, len(self.layers[3])-1)
            while i in already_used:
                i = random.randint(0, len(self.layers[3])-1)
            already_used.add(i)
            self.constructGraph(self.layers[3][i], 3, num_each)
#        already_used= set()
#        for num in range(0, num_backlinks):
#            i = random.randint(0, len(self.layers[4])-1)
#            while i in already_used:
#                i = random.randint(0, len(self.layers[4])-1)
#            already_used.add(i)
#            self.constructGraph(self.layers[4][i], 4, num_each)

    def readLayerZeroFile(self, layer0_txt):
        layer0_txt = open(layer0_txt, 'r')
        for line in layer0_txt:
            line = line.replace("\n", "")
            if not line=="" and not line==" ":
                self.addSeed(Page(line.replace("\n", ""), True))

    def readSeedsFile(self, seeds_txt):
        seeds_txt = open(seeds_txt, 'r')
        for line in seeds_txt:
            line = line.replace("\n", "")
            if not line=="":
                self.addToQueue(5, line.replace("\n", ""), 1) #score = 1

    def getBackLinks(self, page_url, num, layer):
        print 'Getting ', page_url, 'backlinks...'
        yahoo = Yahoo(page_url, num)
        return yahoo.lista
#        lista = []
#        for pos in range(0, num):
#            url = 'www.backlink'+str(pos)+'.com?layer='+str(layer)
#            lista.append(url)
#        return lista


    def trainClassifier(self):
        # build a list of lists
        list_layers = []
        for layer in self.layers:
            current_layer = []
            for page in layer:
                current_layer.append(page.text)
            list_layers.append(current_layer)
        self.classifier = QueueClassifier(list_layers)
        print self.classifier.predictQueue("ei ou are one")

    def fetchSeeds(self):
        for page in self.layers[0]:
            page_dict = self.fetcher.getPage(page.url)
            page.setText(page_dict['text'])

    def printLayers(self):
        for i in range(0,5):
            print "Layer", i, ' - Size: ', len(self.layers[i])
            for page in self.layers[i]:
                if not page.isSeed():
                    pass
#                    print "[",i,"] -", page.url, "| points_to:", page.points_to.url, "| Seed?= ", page.isSeed()
                else:
                    pass
Example No. 21
 def __init__(self):
     self.fetcher = Fetcher()
     self.proxies_pool = set()
     self.init_pool()
     gevent.spawn(self.fetch_forever)
     gevent.spawn(self.test_forever)
Example No. 22
def work(self, pro_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)

    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type

    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=pro_type.lower())

    # nb_module_name = pro_type.capitalize() +"Nb" + "Handler"
    # nb_handler  = ClassFactory.getClassInst(module_name, package_name = "Parser", pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    # db_inst = DBManager.getInstance(des_db_dict["type"], normal_table, host = des_db_dict["host"], port = des_db_dict["port"])  # ssdb stores the parsed data

    # kfk_inst = DBManager.getInstance("kafka", "qyxx_html", host = "spider7", port = 9092)
    # debug_normal_table =  "new_"+pro_type.lower()+"_data"
    # db_debug = DBManager.getInstance("mongo",debug_normal_table, host = "spider7", port = 27037)  # mongo, data stored locally, used for debugging

    # fetch=Fetcher(queue_name.lower() ,"qyxx") enable this if not debug

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            # source_dict = fetch.hget()
            source_dict = fetch.get()

            if source_dict:
                # copy the seed info into the parsed data
                if source_dict.has_key("bbd_seed"):
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if source_dict.has_key("BBD_SEED"):
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup()  # avoid data loss if the process exits abnormally
                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("start to a new seed %s", source_dict)

                #debug
                # db_inst.changeTable("new_"+pro_type.lower())
                # db_inst.save(source_dict);
                # rowkey=source_dict["rowkey"]
                # db_inst.hset(rowkey,source_dict)
                # db_inst.changeTable("new_"+pro_type.lower()+"_processed")
                # db_inst.save(source_dict)
                res_dict = handler.parse(source_dict, res_dict)

                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)

                    #for debug
                    db_debug.save(res_dict)

                    # db_inst.save(res_dict)
                    # kfk_inst.save(source_dict)
                    # print "kfk size:",kfk_inst.size()
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict

                    # db_inst.save(res_dict)
                    db_debug.save(res_dict)

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            print str(e)
            raise Exception(e)
Example No. 23
    def __go(self, target, category=0):

        if category == 0:
            self.type_0_lock.acquire()
        elif category == 1:
            self.type_1_lock.acquire()

        try:

            sqlUtil = MySqlDAL()

            LogRec.get_logger(Config.INFLOGGER).info(u"开始抓取文章链接。。。")
            crawler = WebCrawler()

            # get the page links
            crawler.set_target(target)
            tmp_link_list = crawler.get_page_link()

            # filter out entries already in the database
            target_tuple = list()

            for row in tmp_link_list:
                tmp_link = row[0].strip()
                # handle the blog case
                if tmp_link.startswith(r"http://blog.sina.com.cn"):
                    tmp_index = tmp_link.rfind("?")
                    if tmp_index != -1:
                        tmp_link = tmp_link[:tmp_index]

                tmp_sql = "SELECT * FROM " + self.table_name + " WHERE target='" + tmp_link + "'"
                if not sqlUtil.get_dimensions_one_row(tmp_sql):
                    target_tuple.append(row)
                # else:
                #     print "already"

            # for row in target_tuple:
            #     for item in row:
            #         print item,
            #     print ""

            LogRec.get_logger(Config.INFLOGGER).info(u"获取最新链接列表,准备开始捕获文章内容")

            # fetch the content with multiple threads
            f = Fetcher(threads=10)

            for row in target_tuple:
                if row[0].startswith(r"http://finance.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 0, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )
                elif row[0].startswith(r"http://licaishi.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 1, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )
                elif row[0].startswith(r"http://blog.sina.com.cn"):
                    tmp_index = row[0].rfind("?")
                    if tmp_index != -1:
                        tmp_target = row[0][:tmp_index]
                        f.push(
                            {
                                "target": tmp_target.strip(),
                                "type": 2,
                                "title": row[1].strip(),
                                "publish_time": row[2].strip(),
                            }
                        )
                    else:
                        f.push(
                            {
                                "target": row[0].strip(),
                                "type": 2,
                                "title": row[1].strip(),
                                "publish_time": row[2].strip(),
                            }
                        )
                elif row[0].startswith(r"http://guba.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 3, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )

            LogRec.get_logger(Config.INFLOGGER).info(u"准备写入数据库")

            while f.task_left():
                res_map = f.pop()

                # print res_map['info']
                self.__write_to_db(sqlUtil, res_map, category)

                time.sleep(0.5)

            sqlUtil.destory()

            LogRec.get_logger(Config.INFLOGGER).info(u"数据库写入完毕")

        except Exception, e:
            info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)

            for file, lineno, function, text in traceback.extract_tb(info[2]):
                err_str = file, "line:", lineno, "in", function
                err_logger.error(err_str)
            err_str = "** %s: %s" % info[:2]
            err_logger.error(err_str)
Example No. 24
from Frontier import Frontier
from Fetcher import Fetcher
import redis

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

url_frontier = Frontier(redis_client=r)
url_frontier.add_seed(
    ['https://unsplash.com/', 'https://news.ycombinator.com/'])
url_frontier.start_seeding()

pipelines = url_frontier.get_frontier()

fetcher = Fetcher(redis_client=r)
for pipeline in pipelines:
    fetcher.fetch(pipeline)
Example No. 25
 def __init__(self):
     self.fetcher = Fetcher()
     self.proxies_pool = set()
     self.init_pool()
     gevent.spawn(self.fetch_forever)
     gevent.spawn(self.test_forever)
Example No. 26
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="xgxx",
                                        pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # stores the parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        while True:
            for province in [
                    'anhui', 'beijing', 'chongqing', 'fujian', 'gansu',
                    'guangdong', 'guizhou', 'hainan', 'hebei', 'heilongjiang',
                    'henan', 'hubei', 'hunan', 'jiangsu', 'jiangxi', 'jilin',
                    'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
                    'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang',
                    'yunnan', 'zhejiang', 'zongju', 'shandong'
            ]:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue,
                                "xgxx",
                                get_db_dict=src_db_dict,
                                save_db_dict=des_db_dict)
                while True:
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict,
                                                     province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(
                                    res_dict, bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_src_table,
                                         str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        source_dict["data"] = res_dict
                        err_db_inst.save(source_dict)
                        raw_input('ssss')
                        raise Exception(e)
            log.info(u'解析完一轮, 一个小时后进入下一轮')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name,
                    "xgxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug
    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                # log.info("start to a new seed %s",seed_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)
Example No. 27
def index(count=10):
    tweets = []
    for tweet in Fetcher.fetch("hyperbole", int(count)):
        tweets.append((tweet, classifier.classify(tweet)))
    return render_template('index.html', tweets=set(tweets), count=count)
Example No. 28
from Fetcher import Fetcher
from ProcessManager import ProcessManager


fetcher = Fetcher()

proc_attrs = {
             'online': {'func': fetcher.fetch_online_players, 'arg': 10},
             'highscores': {'func': fetcher.fetch_highscores, 'arg': 2*60*60}
            }

if __name__ == '__main__':
    pm = ProcessManager()
    pm.run(proc_attrs, 10)


Example No. 29
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    nb_module_name = bbd_type.capitalize() + "Nb" + "Handler"
    nb_handler = ClassFactory.getClassInst(nb_module_name,
                                           package_name="Parser",
                                           pinyin=bbd_type.lower())

    bbd_table = "qyxx_data_nb"
    bbd_src_table = "qyxx_html_nb"
    normal_table = bbd_type + "_data" + "_nb"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # stores the parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name + "_nbxx",
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            source_dict = fetch.hget()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                if res_dict.has_key("year"):
                    res_dict["_id"] = UniField.updateId(
                        res_dict['_id'], res_dict['year'])
                # log.info("start to a new seed %s",seed_dict)

                res_dict = nb_handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC, seed_dict)
                    # log.info(log_info)
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)

                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO, seed_dict)
                    # log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)
Example No. 30
from Fetcher import Fetcher
from Classifier import Classifier

hyp_tweets = [('I am so hungry I could eat a horse', 'hyperbole'),
              ('I have a million things to do', 'hyperbole'),
              ('I had to walk 15 miles to school in the snow, uphill', 'hyperbole'),
              ('She is as heavy as an elephant', 'hyperbole'),
              ('He is as fat as a whale', 'hyperbole'),
              ('Like a god', 'hyperbole'),
              ('They ran like greased lightning', 'hyperbole'),
              ('My grandmother is as old as the hills', 'hyperbole'),
              ('I am dying of shame', 'hyperbole'),
              ('I had a ton of homework', 'hyperbole'),
              ('If I can’t buy that new game I will die', 'hyperbole')]

nor_tweets = [('I do not like this car', 'normal'),
              ('I like this car', 'normal'),
              ('This view is horrible', 'normal'),
              ('I feel tired this morning', 'normal'),
              ('I am not looking forward to the concert', 'normal'),
              ('The door is black', 'normal'),
              ('I love you', 'normal'),
              ('He is my enemy', 'normal')]

tweets = hyp_tweets + nor_tweets

classifier = Classifier()

classifier.train(tweets)
for tweet in Fetcher.fetch("hyperbole", 10):
  print(classifier.classify(tweet))