Example #1
def hepsiburada_linkleri(key):
    session = HTMLSession()
    liste = []
    pattern = re.compile(r'p-[a-zA-Z]{1,}[0-9]{1,}[a-zA-Z0-9]{1,}')  # our regex pattern, e.g. p-HBV00000Y8E0O
    ###################################
    req = session.get(URL[key])
    soup = BeautifulSoup(req.html.html, 'html.parser')
    sayfalama = soup.find('div', {'id': 'pagination'})('li')[-1].text.strip()
    ####################################
    for a in range(1, int(sayfalama)):
        try:
            url = URL[key] + "?sayfa={}".format(a)
            req = session.get(url)
            time.sleep(1.5)
            container = req.html.find('.product-list')[0]
            for i in container.absolute_links:  # all links
                r = re.search(pattern, i)
                if r is not None:
                    urun_kimligi = r.group()  # output, e.g. p-HBV00000Y8E0O
                    liste.append(urun_kimligi)
        except Exception:
            break
    json_save(liste, key)
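Every example on this page hands its scraped results to a project-specific `json_save` helper whose definition is not shown. As a minimal sketch (an assumption, not the projects' actual code), such a helper might simply wrap `json.dump` with UTF-8 output; note that Example #6 belongs to a different project and passes the arguments in the opposite order.

import json
from pathlib import Path

def json_save(data, path):
    # Hypothetical helper, not taken from the projects above: serialize
    # `data` to `<path>.json` as UTF-8 JSON without escaping non-ASCII text.
    path = str(path)
    if not path.endswith('.json'):
        path += '.json'
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)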
Example #2
def app_view_scrapy(name1, name2):
    items = {}
    for link in scrapy(name1, name2):
        url = requests.get(link)
        bs = soup(url.content, 'html.parser')
        ###
        download = bs.find('div', {'id': 'indir'}).find("a", {'itemprop': 'downloadURL'})
        baslik = bs.find('div', {'id': 'baslik'})
        app_information = bs.find('div', {'id': 'detayads'}).find_all('div', {'class': 'dl'})
        ###
        img_link = baslik.find('img', {'itemprop': 'image'})
        baslik_name = baslik.find('span', {'itemprop': 'name'}).text.replace(' ', '_').lower().replace('ı','i')
        rating_info = baslik.find('div', {'class': 'rating'}).find('strong', {'class': 'ratingval'})
        rating_star = rating_info.span.text
        rating_oy = rating_info.find('span', {'itemprop': 'ratingCount'})
        data = {
                #'virus_scan':app_information[7]('span').text,
                'system': app_information[6]('span')[-1].text,
                'update': app_information[5]('span')[0].text,
                'interface': app_information[4]('span')[0].text.strip().replace('\n', '').replace('İ', 'i').replace('ç', 'c').replace('I', 'i').replace('ü', 'u'),
                'person': app_information[3]('span')[0].text.strip().replace('\n', '').replace('ç', 'c'),
                'size': app_information[2].text[6:].strip().replace('\n', ''),
                'license': app_information[0]('span')[0].text.strip().lower().replace('\n', '').replace('Ü', 'u').replace('ç', 'c').replace('ü', 'u'),
                'publisher': app_information[1]('span')[0].text.lower(),
                'rating_star': rating_star,
                'rating': rating_oy.text if rating_oy is not None else '0',
                'img_link': img_link['src'] if img_link is not None else None,
                'download_link': download.get('onclick')[11:-13] if download is not None else None,
        }
        items[baslik_name] = data
    json_save(items,"unix-linux")
    return items
Example #3
def tum_kategori_linkleri():
    session = HTMLSession()
    r = session.get('https://www.hepsiburada.com/tum-kategoriler')
    soup = BeautifulSoup(r.html.html, 'html.parser')
    a = 0
    items = {}
    while True:
        try:
            container = soup.find_all('div', class_='categories')
            group = container[a].find_all('div', class_='group')
            for i in group:
                anabaslik = i.find('a')
                if anabaslik is not None:
                    it = {}
                    for j in i('a'):
                        baslik = j.text.lower().replace('&', '').replace(' ', '').replace('/', '').replace('ö', 'o').replace('ü', 'u')
                        link = "https://www.hepsiburada.com" + str(j.get('href'))
                        it[baslik] = link
                    items[anabaslik.text.lower().replace('.', '').replace('/', '')] = it
            a += 1
        except Exception:
            break
    json_save(items, 'tüm_kategoriler')
Example #4
def android_and_ios_scrapy(kategori, key, path):
    android = "https://www.gezginler.net/android/"
    ios = "https://www.gezginler.net/ios/"
    if kategori == 'android':
        kategori = android
    elif kategori == 'ios':
        kategori = ios
    else:
        print('Sadece android ve ios kategorisi vardır...')  # "Only the android and ios categories are available..."
        return 0
    url = kategori + key
    driver = webdriver.Chrome()
    driver.get(url)
    height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == height:
            break
        height = new_height
    time.sleep(1)
    source = driver.page_source
    driver.close()
    bs = soup(source,'html.parser')
    items = {}
    for app in bs.find_all('div',{'class':'masonry-brick'}):
        href = app.find('a').get('href')
        title = app.find('a').strong.text
        title = title.lower().replace(' ','_').replace('-','').replace('!','').replace('ü','u').replace('ç','c').replace('İ','i').replace('ğ','g').replace('ö','o')
        #info = app.find('a')[-1].text
        items[title.replace('ş','s').replace('ı','i')] = href
    json_save(items, path)
Example #5
def lncanon_characters_scrapy():
    req = requests.get(URL['characters']['lncanon'], headers=HEADERS)
    soup = bs(req.content, 'html.parser')
    items = {}
    for x in range(2):
        for tr in soup('table', class_='wikitable')[x]('tr')[1:]:
            name = tr('td')[1].text
            data = {
                'name_url': 'https://onepiece.fandom.com' + tr('td')[1].a.get('href'),
                'chapter_no': tr('td')[2].text.replace('\n', ''),
                'chapter_no_url': 'https://onepiece.fandom.com' + tr('td')[2].a.get('href') if tr('td')[2].a is not None else None,
                'episode_no': tr('td')[3].text.replace('\n', ''),
                'episode_no_url': 'https://onepiece.fandom.com' + tr('td')[3].a.get('href') if tr('td')[3].a is not None else None,
                'year': tr('td')[4].text.replace('\n', ''),
                'note': tr('td')[5].text.replace('\n', '')
            }
            items[name.lower().replace(' ', '_').replace('\n', '')] = data
    json_save(items, 'lncanon')
Example #6
File: vocab.py  Project: liuqiskan/nlp
    def create(self, text_lines):
        all_word = []
        for sent in text_lines:
            for word in sent:
                all_word.append(word)

        self.__vocab_list.extend(list(sorted(set(all_word))))
        word_to_index = {word: i for i, word in enumerate(self.__vocab_list)}
        u.json_save(c.vocab_path, word_to_index)
Example #7
def dizi_listesi_scrapy():
    # dizilistesi/yerlidizi.json
    # dizilistesi/trdublaj.json
    # dizilistesi/anime.json
    # dizilistesi/asya.json
    with open('dizilistesi/yerlidizi.json', 'r') as f:
        data = json.load(f)
        json_save(view_scrapy(data), 'data/yerlidizi_')
Example #8
def dizi_listesi(file_name):
    req = requests.get(URL[file_name])
    soup = bs(req.content, 'html.parser')
    items = {}
    for i in soup.find('div', {'class': 'dizi_turleri'})('a'):
        name = i.font.text.lower().replace('ü', 'u').replace('ğ', 'g').replace('ı', 'i').replace('ç', 'c').replace('•', '').replace('!', '').replace('(anime)', '')
        link = i.get('href')
        items[name.strip().replace(':', '.').replace('.', '').replace('(tã¼rkã§e dublaj)', '')] = link
    json_save(items,"dizilistesi/"+ file_name)
Example #9
def organizations_marines_scrapy():
    req = requests.get(URL['organizations']['marines'], headers=HEADERS)
    soup = bs(req.content, 'html.parser')
    items = {}
    for i in soup('div', class_='Gallery-pic'):
        name = i.a.get('title')
        # data = {
        #     'name_url': ,
        #     #'image_url':i.img.get('data-src')
        # }
        items[name.lower()] = 'https://onepiece.fandom.com/' + i.a.get('href')
    json_save(items, 'marines')
Example #10
def pirate_crews_scrapy():
    req = requests.get(URL['pirate_crews']['whitebeard'], headers=HEADERS)
    soup = bs(req.content, 'html.parser')
    items = {}
    liste = [
        'status', 'age', 'birthday', 'height', 'bounty', 'occupations',
        'epithet', 'japanese_name', 'devil_fruit', 'fruit'
    ]
    #print(soup('table',class_='cs')[0]('th')[0].text)
    for j in soup('div', class_='Gallery-pic')[:-2]:
        name = j.a.get('title')
        name_url = 'https://onepiece.fandom.com' + j.a.get('href')
        rq = requests.get(name_url, headers=HEADERS)
        view_soup = bs(rq.content, 'html.parser')
        aside = view_soup('aside')[0]
        # first set every key from liste to None inside data, because some characters are missing some of these attributes
        data = {}
        for li in liste:
            data[li] = None
        try:
            section0 = aside('section', class_='pi-item')[0]
            section1 = aside('section', class_='pi-item')[1]
        except IndexError:
            section1 = ''
        for i in section0('div')[:-2]:
            if i('h3') != []:
                h3 = i('h3')[0].text.lower().replace(':', '').replace(' ', '')
                for h3_name in liste:
                    if h3 == h3_name:
                        print(h3_name, i('div')[0].text)
                        data[h3_name] = sup_cite_parser2(i('div')[0].text.replace('\"', '')).strip()
                        if section1 == '':
                            data['devil_fruit'] = None
                        else:
                            data['devil_fruit'] = 'yes'
                            data['fruit'] = sup_cite_parser2(section1('div')[0].find('div').text)
        items[name] = data
    json_save(items, 'whitebeard')
Example #11
def straw_hat_prites_galery_scrapy(name, path):
    session = HTMLSession()
    req = session.get(f'https://onepiece.fandom.com/wiki/{name}/Gallery')
    titles = bs(req.html.find('#toc')[0].html, 'html.parser')
    items = {}
    a = 0
    for x in range(2, len(titles('li')) * 2 - 1, 2):
        image_table = req.html.xpath(f'//*[@id="mw-content-text"]/div/table[2]/tbody/tr/td/div[{x}]')
        soup = bs(image_table[0].html, 'html.parser')
        liste = []
        for i in soup('span'):
            data = {
                'explanation': i('div')[0].text.lower().replace('\"', '').replace('\n', ''),
                'img_url': i('td')[0].a.get('href'),
            }
            liste.append(data)
        items[req.html.find('.mw-headline')[a].text.lower().replace('&', '-').replace(' (', '_').replace(')', '')] = liste
        a += 1
    json_save(items, path)
Example #12
def straw_hat_prites_scrapy(path):  # https://onepiece.fandom.com/wiki/Straw_Hat_Pirates --> table
    url = URL['strawhatpirates']
    req = requests.get(url, headers=HEADERS)
    soup = bs(req.content, 'html.parser')
    table = soup('table', class_='sortable')
    items = {}
    for i in range(1, len(table[0]('tr')), 2):
        name = table[0]('tr')[i].td.text.strip()
        unvan = table[0]('tr')[i]('td')[5]
        item = {
            'wiki_url': 'https://onepiece.fandom.com' + table[0]('tr')[i].td.a.get('href'),
            'profession': table[0]('tr')[i]('td')[1].text.lower().strip(),
            'capabilities': table[0]('tr')[i]('td')[2].text.strip().lower().replace("", '').split('\n'),
            'epithet': (unvan.b.text.replace('"', '') if unvan.b else unvan.text).lower(),
        }
        items[name] = item
    json_save(items, path)
Example #13
def non_canon_characters_scrapy():
    req = requests.get(URL['characters']['noncanon'], headers=HEADERS)
    soup = bs(req.content, 'html.parser')
    items = {}
    for tr in soup('table', class_='wikitable')[0]('tr')[1:]:
        name = tr('td')[1].text
        data = {
            'name_url': 'https://onepiece.fandom.com' + tr('td')[1].a.get('href'),
            'type': tr('td')[2].text,
            'number': tr('td')[3].text,
            'year': tr('td')[4].text,
            'appears_in': tr('td')[5].text.replace('\n', ''),
            'appears_in_url': 'https://onepiece.fandom.com' + tr('td')[5].a.get('href') if tr('td')[5].a is not None else None
        }
        items[name.lower()] = data
    json_save(items, 'noncanon1')
Example #14
def rewrite_site_projects(files_and_data):
    parents_seen = set()
    files_seen = set()

    for file in files_and_data.keys():
        parent_dir = pathlib.Path(*file.parts[:3])
        if parent_dir not in parents_seen:
            for json_file in (WORKING_DIR / parent_dir).glob('*.json'):
                files_seen.add(json_file.relative_to(WORKING_DIR))
            parents_seen.add(parent_dir)
    
    for file, data in files_and_data.items():
        json_save(data, WORKING_DIR / file)
    
    files_to_delete = files_seen.difference(files_and_data)
    print(f'{len(files_to_delete)} files to be deleted')
    print('Committing updated projects')
    try:
        git_fs.update_localization(list(files_and_data), list(files_to_delete))
    except Exception as e:
        logging.exception(e)
        logging.error('An error occurred but this may not be a problem')
    print('Creating PR')
    git_fs.publish_localization(list(files_and_data), list(files_to_delete))
Example #15
def update_segment(segment, user):
    """
    segment looks like:
    {
      "segmentId": "dn1:1.1",
      "field": "translation-en-sujato",
      "value": "..", "oldValue": "..."
    }
    """

    segment_id = segment["segmentId"]

    if not is_id_legal(segment_id):
        logging.error(f"Invalid Segment ID: {segment_id}")
        return {"error": "Invalid Segment ID"}

    uid, _ = segment_id.split(":")
    parent_uid = get_parent_uid(uid)

    long_id = f'{parent_uid}_{segment["field"]}'
    try:
        filepath = get_file_path(long_id)
    except KeyError as e:
        logging.exception(e)
        logging.error('f"{long_id}" not found, {segment}')
        return {"error": "file not found"}

    file = get_file(filepath)

    permission = get_permissions(filepath, user['login'])
    if permission != Permission.EDIT:
        logging.error("User not allowed to edit")
        return {"error": "Inadequate Permission"}

    with git_fs._lock:
        try:
            file_data = json_load(file)
        except FileNotFoundError:
            file.parent.mkdir(parents=True, exist_ok=True)
            file_data = {}
        current_value = file_data.get(segment_id)
        result = {}
        if current_value and current_value != segment.get("oldValue"):
            result["clobbered"] = current_value

        if current_value != segment["value"]:
            result["changed"] = True

        file_data[segment_id] = segment["value"]

        sorted_data = dict(sorted(file_data.items(), key=bilarasortkey))

        try:
            json_save(sorted_data, file)
            result["success"] = True
        except Exception:
            logging.exception(f"could not write segment: {segment}")
            return {"error": "could not write file"}

        executor.submit(background_update, filepath, user, segment)
        return result
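The docstring above describes the expected `segment` payload. A hypothetical call could look like the sketch below; the shape of `user` is assumed from the `user['login']` lookup, and the module's own helpers (`get_file_path`, `get_permissions`, `git_fs`, `json_load`, `json_save`, `executor`) must already be importable.

# Hypothetical usage sketch, not part of the original module.
segment = {
    "segmentId": "dn1:1.1",
    "field": "translation-en-sujato",
    "value": "new translation text",
    "oldValue": "",
}
result = update_segment(segment, user={"login": "example-user"})
# On a successful write `result` contains {"success": True}, plus a
# "changed" flag when the stored value differs from the new one and a
# "clobbered" entry when the file changed since "oldValue" was read.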