Example No. 1
import copy
import os

import crawl


class Paths(object):

    search_path = crawl.Crawl(os.path.realpath(os.path.relpath('..',
                                                               __file__)))

    @property
    def root(self):
        return copy.deepcopy(self.search_path.root)

    @property
    def paths(self):
        return copy.deepcopy(self.search_path.paths)

    def prepend_path(self, *paths):
        self.prepend_paths(*paths)

    def prepend_paths(self, *paths):
        # Assumes Crawl exposes prepend_paths alongside append_paths.
        self.search_path.prepend_paths(*paths)

    def append_path(self, *paths):
        self.append_paths(*paths)

    def append_paths(self, *paths):
        self.search_path.append_paths(*paths)

    def clear_paths(self):
        for path in copy.deepcopy(self.search_path.paths):
            self.search_path.remove_path(path)
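A minimal usage sketch, assuming crawl.Crawl tracks a root and a list of search paths as the accessors above imply; the instance name and literal paths are illustrative:

paths = Paths()
paths.append_paths('/opt/tools', '/opt/extra')  # delegates to Crawl.append_paths
print(paths.paths)   # deep copy, so callers cannot mutate the real search path
paths.clear_paths()  # removes every tracked path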
Example No. 2
    def gen_captcha(self, batch_size=50):
        # Infinite generator of (X, Y) batches: X holds grayscale captcha
        # images scaled to [0, 1], Y holds one-hot character labels.
        #image = ImageCaptcha(width = self.width, height = self.height, font=self.font, font_sizes=56)
        craw = crawl.Crawl()

        while True:
            # Allocate fresh arrays each pass so the flattening reshape on
            # yield never clobbers the buffer indexed on the next iteration.
            X = np.zeros([batch_size, self.height, self.width, 1])
            Y = np.zeros([batch_size, self.char_num, self.classes])
            for i in range(batch_size):
                #captcha_str = ''.join(random.sample(self.characters, self.char_num))
                #img = image.generate_image(captcha_str).convert('L')

                craw.Start()
                captcha_str = craw.CaptchaString

                img = craw.CaptchaImage
                img = np.array(img.getdata())
                X[i] = np.reshape(img, [self.height, self.width, 1]) / 255.0
                for j, ch in enumerate(captcha_str):
                    Y[i, j, self.characters.find(ch)] = 1
            yield X, np.reshape(Y, (batch_size, self.char_num * self.classes))
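A hedged usage sketch; CaptchaGen stands in for whatever class owns gen_captcha, and model is a placeholder, but the generator protocol follows from the yield above:

gen = CaptchaGen().gen_captcha(batch_size=32)
X, Y = next(gen)  # X: (32, height, width, 1) floats in [0, 1]
                  # Y: (32, char_num * classes) one-hot rows
model.fit(gen, steps_per_epoch=100)  # the endless loop suits generator training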
Example No. 3
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://www.ttt.uz')

# Page field labels, given in their Russian and Uzbek spellings.
dictionary = {
    'year': ['Год выпуска', 'Год', 'Chiqarilgan Yili'],
    'country': ['Страна', 'Davlat'],
    'genre': ['Жанр', 'Janr'],
    'time': ['Время', 'Davomiyligi'],
    'director': ['Режиссер', 'Rejisyor'],
    'actors': ['В главных ролях', 'Bosh ro’lda'],
    'version': ['Версия'],
    'dev': ['Разработчик'],
    'lang': ['Язык'],
    'tbd': ['Перевод', 'Tarjima']
}
crawler.require = './/div[@class="entry-content"]'


def scrape(t):
    t.br_replacer = ' ‧ '
    t.dropFromMain('.//script')

    t.set_main('description')

    t.get_data('title', './/h1[@class="entry-title"]')
    t.get_atr('img', './/img[@class="aligncenter wp-post-image"]/@src')
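The excerpt stops before the hook is wired up; by analogy with the complete scripts in examples No. 7 and No. 11, it would presumably end with:

crawler.scrape = scrape
crawler.crawl()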
Example No. 4
import crawl

c = crawl.Crawl()

c.getListPage()
Example No. 5
import os
import datetime
import crawl
import write_to_csv
import to_json
import clean_csv
import logger
import parse

if __name__ == '__main__':
    project_name = datetime.datetime.now().strftime("%d_%m_%Y")

    logger.write('Crawling start')
    web_crawler = crawl.Crawl()
    web_crawler.crawl(project_name)
    logger.write('Crawling end')

    csv_path = 'data.csv'
    to_csv = write_to_csv.ToCSV(csv_path)

    rootDir = 'thedailystar/' + project_name
    i = 0
    for current_folder, sub_folders, fileList in os.walk(rootDir):
        # Skip folders that contain Bangla-language news
        if 'bangla' in current_folder:
            # print(current_folder)
            continue
        for file in fileList:
            if file.endswith('.html') and '__rss' not in file:
                full_path = os.path.join(current_folder, file)
                normalised_html_file_path = os.path.normpath(full_path)
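For comparison only, the same file-collection step can be sketched with pathlib; this is not part of the original script:

from pathlib import Path

html_files = [p for p in Path(rootDir).rglob('*.html')
              if 'bangla' not in str(p.parent) and '__rss' not in p.name]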
Example No. 6
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))
import crawl

crawler = crawl.Crawl('http://megasoft.uz', 'windows-1251')

crawler.require = './/a[starts-with(@href, "/get/")]'

def condition(t, xpath, xpath2):
	t.mainEl = t.lxml.xpath(xpath)
	return len(t.mainEl) != 0
crawler.condition = condition

stringSel = {
	"os"   : "Система:",
	"size" : "Размер файла: ",
	"lang" : "Язык интерфейса: "
}
other = {
	"publishDate"  : "Добавлено:",
	"downloadCount": "Количество загрузок: "
}
xpathstr = {}
xpath = {}
template = './/table[@width="300"]//td[text()="{0}"]/following-sibling::td'
for x in stringSel:
	xpathstr[x] = template.format(stringSel[x])
for x in other:
	xpath[x] = template.format(other[x])
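For instance, xpathstr['os'] expands to the selector below, which picks the table cell immediately following the label cell:

.//table[@width="300"]//td[text()="Система:"]/following-sibling::td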
Example No. 7
import sys

sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://mytube.uz')
crawler.limited = False

crawler.require = './/div[@class="WhiteBlock CommentsBlock"]'


def scrape(t):
    t.get_data('title', './/h2')
    t.get_data('description', './/div[@id="aboutUser"]/pre')
    t.get_data('category',
               './/span[@class="userinfobox-categories-tags-container"]/a[1]')
    t.get_data_array(
        'tags',
        './/span[@class="userinfobox-categories-tags-container"]/a[not(position()=1)]'
    )
    t.get_data_int('views', './/div[@class="Views-Container"]')
    t.get_data_date('publishDate', './/div[@class="Date"]/text()[last()]')


crawler.scrape = scrape
crawler.urlNotContains.extend(('/uz/', '/oz/'))

crawler.crawl()
Example No. 8
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))
import crawl

crawler = crawl.Crawl('http://tas-ix.me/')

crawler.require = './/table[@id="topic_main"]//div[@class="post_wrap"]'
crawler.require2 = './/fieldset[@class="attach"]'


def condition(self, xpath1, xpath2):
    self.mainEl = self.lxml.find(xpath1)
    if self.mainEl is not None:
        return self.mainEl.find(xpath2) is not None
    else:
        return False


crawler.condition = condition
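The two None-checks can be collapsed into one expression; a behavior-equivalent variant:

def condition(self, xpath1, xpath2):
    # True only when the topic post exists and contains an attachment block.
    self.mainEl = self.lxml.find(xpath1)
    return self.mainEl is not None and self.mainEl.find(xpath2) is not None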


def scrape(t):
    t.dropFromMain('.//fieldset[@class="attach"]')
    t.dropFromMain('//div[@class="sp-body" and @title="MediaInfo"]')
    t.dropFromMain('//script')
    t.set_main('description')

    t.get_data('title', './/h1[@class="maintitle"]')
    t.get_data_array('category',
                     '(.//td[@class="nav w100"])[1]/a[not(position()=1)]')
Example No. 9
import sys

sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))
import crawl

crawler = crawl.Crawl('http://topmusic.uz', 'windows-1251')


def condition(t, req1, req2):
    if '/album-' in t.url:
        crawler.scrape = albumScrape
        return True
    else:
        crawler.scrape = artistScrape
        if t.lxml.find('.//div[@id="clips_section"]') is not None:
            return True
        elif t.lxml.find('.//div[@id="singls_section"]') is not None:
            return True
        else:
            return False


def artistScrape(t):
    t.lxml = t.lxml.find('.//div[@class="box-mid"]')
    t.get_data('title', './/h2[1]')
    t.get_data('genre', './div[1]//a[1]')
    t.get_atr_array('clip', './/div[@class="clip-box"]/a[3]/@title',
                    20)  # strips the first 20 characters of each title
    t.get_atr_array('single', './/a[@class="play-track"]/@title')
Example No. 10
def __init__(self):
    self.url_table = url_table.UrlTable()
    self.crawl = crawl.Crawl()
    self.webpage_parse = webpage_parse.WebPageParse()
    self.webpage_save = webpage_save.WebPageSave()
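A hypothetical driver loop showing how the four components might cooperate; every method name here (empty, pop, fetch, parse, save, add) is invented for illustration, since the real APIs are not shown:

def run(self):
    # Drain the URL frontier: fetch each page, persist it, then enqueue
    # any newly discovered links.
    while not self.url_table.empty():
        url = self.url_table.pop()          # hypothetical frontier API
        page = self.crawl.fetch(url)        # hypothetical fetch API
        links = self.webpage_parse.parse(page)
        self.webpage_save.save(url, page)
        self.url_table.add(links)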
Example No. 11
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('https://mover.uz')
crawler.limited = False

crawler.require = './/h1[@class="fl video-title"]'

def scrape(t):
	t.set_main('title')

	t.get_data('description', './/div[@class="desc-text"]')
	# Drop the trailing 21 characters (presumably fixed site boilerplate).
	t.data['description'] = t.data['description'][:-21]
	t.get_data('category', './/p[@class="cat-date"]/a')
	t.get_data_array('tags', './/p[@class="tags"]/a')
	t.get_data_int('views', './/span[@class="fr views"]/strong')
	t.get_data_int('likes', './/table[@class="r-desc"]/tr/td[@class="like"]')
	t.get_data_int('dislikes', './/table[@class="r-desc"]/tr/td[@class="dislike"]')
	t.get_data_date('publishDate', './/p[@class="cat-date"]/text()[1]')



crawler.scrape = scrape
# crawler.urlNotContains.extend()

crawler.crawl()
Example No. 12
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))
import crawl

crawler = crawl.Crawl('http://mediabox.uz/ru')

crawler.require = './/p[@class="col-lg-12 inner_title"]'
selector = {
  "release" : "Год:",
  "country" : "Страна:",
  "genre"   : "Жанр:",
  "subtitle": "Слоган:",
  "budget"  : "Бюджет:",
  "producer": "Продюсер:",
  "director": "Режиссёр:",
  "actor"   : "Актеры:",
  "lang"    : "Язык:",
  "time"    : "Время:"
}
xpath = {}
for x in selector:
  xpath[x] = './/div[@id="info"]//td[text()="'+selector[x]+'"]/following-sibling::td'

def scrape(t):
  t.set_main('title')
  t.get_atr('img', './/div[@class="cover"]/img/@src')
  for x in xpath:
    t.get_data(x, xpath[x])

  t.get_data('forAge', './/div[@id="info"]/td[contains(., "Возраст:")]/following-sibling::td/b')