Python Processor.changeUrl примеры использования

Язык программирования: Python

Пространство имен/Пакет: Processor

Класс/Тип: Processor

Метод/Функция: changeUrl

Примеров на hotexamples.com: 1

Python Processor.changeUrl — найден 1 пример. Ниже приведён лучший пример кода на Python для Processor.Processor.changeUrl, полученный из проектов с открытым исходным кодом. Вы можете поставить оценку примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Processor(30)

run(9)

detect(5)

post_process(3)

trainClassifier(2)

push_input(2)

extract_class_grids(2)

extract_object_grids(2)

passMat(2)

process(2)

generar_mensaje(2)

extract_boxes(2)

getCPI(2)

getImage(2)

load_processor(2)

__init__(2)

peel(1)

object_sort(1)

output(1)

main(1)

load_all(1)

limpiar_mensaje(1)

letras_consecutivas_repetidas(1)

n_core(1)

plot_pca_spectrum(1)

perform_pca(1)

reduce_dimension(1)

solve(1)

show_labels(1)

runCacti(1)

reset(1)

refine_bbox(1)

read_tiff(1)

plot_clusters(1)

read_output(1)

process_quote_arguments(1)

processImage(1)

predict(1)

insert_processor_mode(1)

plot_grid_search(1)

intra_cluster_average(1)

ConvolutionPlot(1)

insert_mode(1)

delete_table(1)

Init(1)

Receive(1)

add_virus(1)

advancedSolve(1)

analyzer(1)

Пример #1

Показать файл

Файл: Crawler.py Проект: BoscoTin/MATH4992-Project

class Crawler:
    """Layer-by-layer (breadth-first) web crawler that feeds an indexer.

    The crawler keeps a queue of URLs for the current layer (``parent``),
    collects the links found on those pages (``children``) and promotes
    them to the queue for the next layer.  Page text is tokenised, stemmed
    and handed to ``self.Indexer``.

    All output is written with single-argument ``print(...)`` calls, which
    produce the same text under the Python 2 print statement and the
    Python 3 print function.
    """

    def __init__(self, numOfLayer):
        """Create the collaborators and seed the queue with the start URL.

        numOfLayer -- maximum number of layers that scrape() will visit.
        """
        self.num = numOfLayer
        self.parent = []    # URLs queued for the layer being crawled
        self.children = []  # links discovered for the next layer
        self.handled = []   # URLs that getOnePage() has already visited
        self.Indexer = Indexer()
        self.Processor = Processor()
        self.Porter = PorterStemmer()
        self.db = []        # URLs loaded from db.getAll() by scrape()

        link = "http://www.cse.ust.hk/"
        self.parent.append(link)

    def handleLink(self, links):
        """Run ``links`` through every Processor clean-up step, in order.

        Returns whatever ``Processor.clearDuplicate`` returns for the
        cleaned links.
        """
        processedLinks = self.Processor.waiveUnrelatedDomain(links)
        processedLinks = self.Processor.clearSubfix(processedLinks)
        processedLinks = self.Processor.clearUnwantedFiles(processedLinks)
        processedLinks = self.Processor.changeUrl(processedLinks)

        return self.Processor.clearDuplicate(processedLinks)

    def _collectChildren(self, soup, parent):
        """Queue the cleaned outgoing links of a page and return them.

        Every ``<a href=...>`` is resolved against ``parent``, passed through
        handleLink(), echoed to stdout and appended to ``self.children``.
        """
        found = [urljoin(parent, anchor.get('href'))
                 for anchor in soup.findAll('a', href=True)]

        children = self.handleLink(found)
        for child in children:
            try:
                # Only used as an "is this pure ASCII?" probe.  UnicodeError
                # also covers the UnicodeDecodeError that Python 2 raises
                # for byte strings containing non-ASCII bytes.
                child.encode('ascii')
            except UnicodeError:
                print("there are non-ascii characters in there")
            else:
                print(child)
            self.children.append(child)
        return children

    def _extractWords(self, soup):
        """Return the stemmed ASCII words found in the page's ``<p>`` tags."""
        tokens = []
        for tag in soup.find_all('p'):
            tokens.extend(tag.getText().split())

        words = []
        for token in tokens:
            # Drop every non-ASCII character; str() keeps the result a
            # native string on both Python 2 and Python 3.
            text = str("".join(ch for ch in token if ord(ch) < 128))
            # Replace punctuation by white space and split again.
            for mark in string.punctuation:
                text = text.replace(mark, " ")
            words.extend(text.split())

        return [self.Porter.stem(word) for word in words]

    def getOnePage(self):
        """Visit the next queued URL: collect its links and index its text.

        Does nothing when the queue is empty.  A URL that was already
        handled is removed from the queue and skipped; it is recorded in
        ``self.handled`` only once.  Connection errors and timeouts are
        reported on stdout and the URL is still marked as handled.
        """
        if not self.parent:
            return

        parent = self.parent.pop(0)
        if parent in self.handled:
            return

        print("")
        print("Searching {}".format(parent))
        try:
            request = requests.get(parent, timeout=20)
            # check if the page can be connected successfully
            if request.status_code == requests.codes.ok:
                soup = BeautifulSoup(request.text, 'html.parser')
                children = self._collectChildren(soup, parent)

                # Pages whose URL is already listed in self.db are not
                # indexed again.
                if parent not in self.db:
                    processedWords = self._extractWords(soup)
                    if processedWords:
                        self.Indexer.process(parent, processedWords,
                                             children)
                    else:
                        print("The document contains no word")

        except requests.exceptions.ConnectionError:
            print("Error in connecting the site.")
        except requests.exceptions.Timeout:
            print("Timeout in connecting the site.")

        self.handled.append(parent)

    # search by BFS
    def scrape(self):
        """Crawl up to ``self.num`` layers, starting from the queued URLs.

        Before crawling, the URLs returned by ``db.getAll()`` (``db`` is a
        name defined outside this class) are added to ``self.db``.
        """
        for record in db.getAll():
            self.db.append(record['url'])
        print(len(self.handled))

        for layer in range(self.num):
            self.parent = self.handleLink(self.parent)
            print("")
            print("Searching layer {}".format(layer))

            if not self.parent:
                break

            # getOnePage() removes one URL per call, so the number of calls
            # is fixed before the queue starts shrinking.
            for _ in range(len(self.parent)):
                self.getOnePage()

            self.parent = self.children
            self.children = []