Example #1
import os
from math import ceil

import grequests

# iterate_by_batch, check_acceptor_decorator and exception_handler are
# helpers defined elsewhere in the same project.


def check_url(rows, progressBar=None):
    print('Process', os.getpid(), 'Count of links', len(rows))
    chunk_size = 16
    chunk_count = 0
    chunk_results = []
    total_chunks_count = ceil(len(rows) / chunk_size)

    chunks = iterate_by_batch(rows, chunk_size, None)

    print('Process', os.getpid(), 'Count of chunks', total_chunks_count)

    for chunk in chunks:
        chunk_count += 1
        print('Process', os.getpid(), 'Chunk:', chunk_count, 'of',
              total_chunks_count)
        results = []

        for link in chunk:
            if not link:
                continue

            acceptor = link[0].value
            anchor = link[1].value
            donor = link[2].value

            if not acceptor or not donor or 'http' not in acceptor or 'http' not in donor:
                continue

            headers = {
                'user-agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
            }
            results.append(
                grequests.get(donor,
                              headers=headers,
                              hooks={
                                  'response':
                                  check_acceptor_decorator(
                                      acceptor, anchor, donor)
                              }))

        results = grequests.map(results,
                                exception_handler=exception_handler,
                                size=16,
                                gtimeout=12)

        if progressBar:
            progressBar.emit(chunk_count / total_chunks_count * 100)

        for result in results:
            if result is not None:
                chunk_results.append(result.link)

    print('Finished: ', os.getpid())

    return chunk_results
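
Every example in this listing feeds its data through iterate_by_batch(iterable, size, fillvalue), a helper that is not shown here. A minimal sketch of the behaviour the callers appear to rely on, assuming it is the usual itertools "grouper" recipe (fixed-size tuples, the last one padded with the fill value):

from itertools import zip_longest


def iterate_by_batch(iterable, size, fillvalue=None):
    # Assumed behaviour: yield the items in tuples of length `size`; the last
    # tuple is padded with `fillvalue`, which is why the callers above skip
    # falsy entries.
    args = [iter(iterable)] * size
    return zip_longest(*args, fillvalue=fillvalue)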
Example #2
    def fix_indexer_files(self, file):
        self.start_time = time()
        self.mode = 'fix_file'
        links = self.get_links(file, for_checking=True)
        self.qlogs.log("Total links: {}".format(len(links)))
        batch_size = ceil(len(links) / self.processes)
        self.qlogs.log('Batch Size: {}'.format(batch_size))

        batches = iterate_by_batch(links, batch_size, None)

        for process, batch in zip(self.processes_list, batches):
            process.set_links(batch)
            process.start()
Example #3
    def parse_xlsx_file(self, file):
        self.start_time = time()
        wb = load_workbook(file)
        ws = wb.active

        links = list(ws.iter_rows())

        batch_size = len(links) // self.processes + 1
        self.qlogs.log('Total Links Count: {}, Batch Size: {}'.format(len(links), batch_size))

        batches = iterate_by_batch(links, batch_size, None)

        for process, batch in zip(self.processes_list, batches):
            process.set_links(batch)
            process.start()
Example #4
    def start_comment_multi(self):
        self.qlogs.log("Starting parse files...")
        self.start_time = time()
        self.read_files()

        self.qlogs.log("Total donors: {}".format(len(self.donors)))

        acceptors = CycledIterator(self.acceptors)

        batch_size = ceil(len(self.donors) / self.processes)
        self.qlogs.log('Batch Size: {}'.format(batch_size))
        batches = iterate_by_batch(self.donors, batch_size, None)

        for process, batch in zip(self.processes_list, batches):
            process.set_donors(batch)
            process.set_acceptors(acceptors.get_batch(len(batch)))
            process.set_emails(self.mails)
            process.set_comments(self.text_comments)
            process.set_usernames(self.user_names)
            process.start()
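
CycledIterator is another helper that does not appear in the listing. Judging from the call pattern above (wrap the acceptors, then request one slice per donor batch), a minimal sketch under that assumption:

from itertools import cycle, islice


class CycledIterator:
    # Assumed behaviour, inferred only from the usage above.

    def __init__(self, items):
        self._cycle = cycle(items)

    def get_batch(self, size):
        # Hand out the next `size` items from the endless cycle.
        return list(islice(self._cycle, size))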
Example #5
    def parse_xlsx_files(self):
        self.mode = 'indexer'
        links = []

        if not self.files:
            return self.qlogs.log('Please select a file')

        self.qlogs.log("Starting to parse files...")
        self.start_time = time()

        for file in self.files:
            links += self.get_links(file)

        self.qlogs.log("Total links: {}".format(len(links)))

        batch_size = ceil(len(links) / self.processes)
        self.qlogs.log('Batch Size: {}'.format(batch_size))
        batches = iterate_by_batch(links, batch_size, None)

        for process, batch in zip(self.processes_list, batches):
            process.set_links(batch)
            process.start()
Example #6
import os
from multiprocessing import Pool

from openpyxl import load_workbook

# check_url (Example #1) and iterate_by_batch come from the same module.


def check_links_in_the_workbook(filename, progressBar=None, multi=True):
    print('CPU count:', os.cpu_count())
    wb = load_workbook(filename)
    ws = wb.active

    links = list(ws.iter_rows())

    batch_size = len(links) // os.cpu_count() + 1
    print('Total Links Count: {}, Batch Size: {}'.format(
        len(links), batch_size))

    if multi:
        batches = iterate_by_batch(links, batch_size, None)

        with Pool(processes=os.cpu_count()) as pool:
            pool_result = pool.map(check_url, batches, 1)

        result = []
        for r in pool_result:
            result += r
    else:
        result = check_url(links, progressBar)

    return result
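
A minimal usage sketch for the function above; the filename is a placeholder, and the workbook is assumed to carry acceptor URL, anchor text and donor URL in its first three columns, which is how check_url in Example #1 reads each row:

if __name__ == '__main__':
    # 'links.xlsx' is a hypothetical input file.
    flagged = check_links_in_the_workbook('links.xlsx', multi=True)
    print('Links collected by the response hooks:', len(flagged))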
        # Part of a helper that reports whether any form input is named
        # 'wc_email'.
        if input.get('name') == 'wc_email':
            return True

    return False


def exception_handler(request, exception):
    print('Exception', request.url, exception)


def grequests_links(pages):
    for page in pages:
        if 'http' not in page:
            print('There is no http in {}'.format(page))
            continue

        grequest_stack.append(grequests.get(page, headers=headers,
                                            hooks={'response': response_callback},
                                            timeout=10))

if __name__ == '__main__':

    batches = iterate_by_batch(pages, batch_size, None)

    for pages in batches:
        grequest_stack = []
        grequests_links(pages)
        results = grequests.map(grequest_stack, exception_handler=exception_handler, size=20)
        print(results)
        raise Exception  # aborts the loop after the first batch
    def run(self):
        self.is_started = True
        can = True
        concurrency = 20
        batch_size = concurrency * 5

        # Keep sweeping over the links until no link has any count left.
        while can:
            total_count = 0

            batches = iterate_by_batch(self.links, batch_size, None)

            for batch in batches:
                results = []

                for link in batch:
                    if not link: continue

                    site, referer, count = link.url, link.referer, link.count

                    if not count: continue

                    link.count -= 1

                    total_count += link.count

                    # Skip sites that already appear in the blacklist more
                    # than five times.
                    if site in self.black_list and self.black_list.count(
                            site) > 5:
                        self.processed += 1
                        self.update_info()
                        continue

                    headers = {
                        'referer':
                        referer,
                        'User-Agent':
                        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                    }
                    results.append(
                        grequests.get(
                            site,
                            headers=headers,
                            hooks={
                                'response':
                                self.check_site_response_decorator(link)
                            },
                            timeout=10))

                try:
                    self.results = grequests.map(
                        results,
                        exception_handler=self.exception_handler,
                        size=concurrency)
                except Exception as e:
                    print('Exception', str(e))
                # self.update_info()

            if not total_count:
                can = False

            print("Worker {}, Total count: {}, processed: {}".format(
                self.number, total_count, self.processed))

        self.finish()
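
check_acceptor_decorator (Example #1) and check_site_response_decorator above are used as factories that build grequests response hooks, but neither is included in this listing. A generic sketch of that closure pattern, under a hypothetical name and with a placeholder body:

def make_response_hook(link):
    # Hypothetical stand-in for the *_decorator factories used above: the
    # closure captures `link`, so the hook knows which row a response belongs
    # to when grequests invokes it.
    def hook(response, *args, **kwargs):
        response.link = link  # lets the caller read result.link afterwards
        return response

    return hook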