Example #1
def split_files(search_key):
    """
    Loops through files, splitting Rashi and ktav yad rashi into 2 different files.
    Recommend running check_demarcation first.
    :param search_key: key to find end of Rashi and beginning of ktav yad rashi
    """

    # loop through files
    for page in range(functions.get_page(72, 'b'), functions.get_page(94, 'a') + 1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))
        rashi = codecs.open(u'rashi_fixed/{}'.format(file_name), 'w', 'utf-8')
        ktav_yad_rashi = codecs.open(u'ktav_yad_rashi/{}'.format(file_name), 'w', 'utf-8')
        original = codecs.open(file_name, 'r', 'utf-8')

        found = False

        for line in original:

            if line.find(search_key) != -1:
                found = True

            if found:
                ktav_yad_rashi.write(line)
            else:
                rashi.write(line)

        original.close()
        rashi.close()
        ktav_yad_rashi.close()
Example #2
def check_demarcation(search_key):
    """
    Sanity-check function: make sure a given search key can be used to find the beginning of the ktav yad Rashi in the
    text. Prints the files missing the search key, as well as the number of files searched and the number of keys found.
    :param search_key: A string indicating where the ktav yad Rashi begins.
    """

    total, count = 0, 0

    # loop through files
    for page in range(functions.get_page(72, 'b'), functions.get_page(94, 'a')+1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))
        rashi_file = codecs.open(file_name, 'r', 'utf-8')
        total += 1

        found_key = False
        for line in rashi_file:
            if line.find(search_key) != -1:
                found_key = True
                count += 1
                break

        if not found_key:
            print(file_name)

        rashi_file.close()

    print('{} files scanned, found the key in {} files'.format(total, count))
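A hypothetical driver for the two functions above, assuming the same `functions` helper module and working directory; the real demarcation string is project-specific, so the key below is only a placeholder:

# Sketch only: the search key must match the actual demarcation text in the files.
SEARCH_KEY = u'...'  # placeholder, not the real key
check_demarcation(SEARCH_KEY)  # verify every file contains the key first
split_files(SEARCH_KEY)        # then split Rashi from ktav yad Rashi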
Example #3
    def get_plist(self, url, st):
        # Create the playlist
        content = func.get_page(url).text
        # New BeautifulSoup object
        soup = BeautifulSoup(content, 'lxml')
        plist = soup.find(name='ul', attrs={'class': 'f-hide'})
        # Pass the csv file name according to the toggle
        if not st.toggle:
            st.csv_fname = st.playlist_title = soup.find(name='h2',
                                                         class_='f-ff2').string
        # Filter the data
        for song in plist.find_all(name='li'):
            # id (renamed from `id` to avoid shadowing the built-in)
            match = re.search('=([0-9]+)', song.a['href'])
            # Avoid recording duplicate song names
            song_id = match.group(1)
            if song_id not in self.songs['id']:
                self.songs['id'].append(song_id)
                # name
                song_name = song.a.string
                self.songs['name'].append(song_name)
                # url
                song_url = 'https://music.163.com' + song.a['href']
                self.songs['url'].append(song_url)
Example #4
def extract_degree_info():
    '''
    Extract and store information about individual degrees by visiting degree URLs.
    '''
    degrees = []

    for url in degree_urls:
        soup = get_page(url)
        if not soup:
            continue
        degree = {'url': url, 'source': 'Edison Project University Programs List'}

        # Degree name
        a = soup.header.h1.find('a', string=True)
        if a:
            degree['degree'] = a.string

        try:
            strings = list(soup.main.stripped_strings)
            for i, string in enumerate(strings):
                if string.split()[0].endswith(':'):
                    degree[string.strip(':').lower()] = strings[i + 1]
        except (AttributeError, IndexError):
            pass

        degrees.append(degree)
        to_json(degree)

    return degrees
Example #5
    def get_lyric(self):
        """Fetch the lyrics."""
        self.songs['lyric'] = []
        total = len(self.songs['id'])
        for song_id in self.songs['id']:
            url = 'http://music.163.com/api/song/lyric?os=pc&id=' \
                  + song_id \
                  + '&lv=-1&kv=-1&tv=-1'
            # Fetch the lyric content
            content = func.get_page(url).json()
            if ('lrc' in content and 'nolyric' not in content
                    and content['lrc'] is not None):
                lyric = content['lrc']['lyric']
                # Clean the lyrics: strip the [mm:ss.xx] timestamp tags
                lyric = re.sub(r'\[.*?\]', '', lyric)
                self.songs['lyric'].append(lyric)
                self.only_lyric.append(lyric)
                print('completed ' +
                      str(round(self.songs['id'].index(song_id) / total * 100, 2)) +
                      '% ', end='')
                print('added lyric id: ' + song_id)
            else:
                # Placeholder so the lyric column has no floating-point NaN holes
                self.songs['lyric'].append('ThisShallBeIgnored')
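The cleaning step relies on the LRC convention of wrapping timestamps in square brackets; a one-line demonstration of the same re.sub call:

import re
# The non-greedy pattern removes each [..] tag, leaving the bare lyric line.
print(re.sub(r'\[.*?\]', '', '[00:12.34]some lyric line'))  # -> 'some lyric line'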
Example #6
def extract_degree_info():

    degrees = []
    url = 'https://analytics.ncsu.edu/?page_id=4184'
    soup = get_page(url)
    if not soup:
        return

    degree_header = soup.find(
        string="CHRONOLOGY OF GRADUATE PROGRAMS IN ANALYTICS AND DATA SCIENCE")
    p_tags = degree_header.find_all_next('p')
    for tag in p_tags:

        degree = {'url': url, 'source': 'Analytics NCSU'}

        # Degree title
        a = tag.find('a', string=True)
        if a:
            degree['degree'] = a.string

        # Degree URL
        a = tag.find('a', href=True)
        if a:
            degree['url'] = a.get('href')

        # First year of enrollment
        try:
            h3 = tag.find_previous('h3')
            degree['first_enrolled'] = int(
                h3.string.replace('\u2022', '').strip())
        except (AttributeError, ValueError):
            degree['first_enrolled'] = None

        # University and department are found in a single string separated by commas.
        try:
            # Remove link tag to access university and department string in p tag.
            tag.a.decompose()
            text = tag.string.strip().split(',')
            text = [value.strip() for value in text if value != '']
        except AttributeError:
            degree['university'] = None
            degree['department'] = None
        else:
            # Values can be assigned with certainty when there are only two commas.
            if len(text) == 2:
                degree['university'] = text[0]
                degree['department'] = text[1]

            # Otherwise (like when a university name includes a comma), both fields are
            # assigned the entire string and can be cleaned up later.
            else:
                degree['university'] = ','.join(text)
                degree['department'] = ','.join(text)

        degrees.append(degree)

    return degrees
Example #7
def extract_degree_links():
    '''
    Find URLs that lead to degree pages.
    '''
    degree_urls = []
    for url in source_urls:
        soup = get_page(url)
        if soup:
            for td in soup.find_all('td', class_='views-field views-field-title'):
                href = td.find('a', href=True).get('href')
                degree_urls.append('http://edison-project.eu' + href)

    return degree_urls
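Examples #4 and #7 form a two-step pipeline: collect the degree page URLs first, then scrape each page. A hypothetical driver, assuming both functions and the module-level `degree_urls` global they share are defined as above:

# Sketch only: extract_degree_info() in Example #4 reads the degree_urls global.
degree_urls = extract_degree_links()
degrees = extract_degree_info()
print('scraped {} degree records'.format(len(degrees)))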
Example #8
def extract_degree_info():

    degrees = []
    url = 'https://www.datasciencegraduateprograms.com/school-listing/#context/api/listings/prefilter'
    soup = get_page(url)
    if not soup:
        return

    for tag in soup.find('div', class_='stateheader-departments').find_all_next('a', href=True):

        degree = {'url': url, 'source': 'Data Science Graduate Programs'}

        # Degree title
        parent = tag.parent
        if parent:
            degree['degree'] = parent.string

        # Degree university
        h3 = tag.find_previous('h3', string=True)
        if h3:
            degree['university'] = h3.string

        # Degree department
        strong = tag.find_previous('strong', string=True)
        if strong:
            degree['department'] = strong.string

        # Degree URL
        degree['url'] = tag.get('href')

        # Degree state
        h2 = tag.find_previous('h2', string=True)
        if h2:
            degree['state'] = h2.string

        # Misc. properties of degree
        ul = tag.find_next('ul')
        if ul:
            degree['properties'] = [li.string for li in ul.find_all('li')]

        # Degree accreditation info (the 'accredidation' key spelling is the source's)
        em = tag.find_next('em', string=True)
        if em:
            degree['accredidation'] = em.string

        degrees.append(degree)
        to_json(degree)

    return degrees
Example #9
    def get_playlists(self, st):
        try:
            with open('res/' + st.search_keyword + '.json',
                      encoding='UTF-8') as f:
                p_json = json.load(f)
        except FileNotFoundError:
            url = 'http://music.163.com/api/search/get/web?csrf_token=hlpretag=&hlposttag=&s={' \
                  + st.search_keyword + '}&type=1000&offset=0&total=true&limit=' + str(st.result_limit)
            p_json = func.get_page(url).json()
            # Cache the search result so the next run can read the local file
            with open('res/' + st.search_keyword + '.json',
                      'w',
                      encoding='UTF-8') as k:
                text = json.dumps(p_json, ensure_ascii=False)
                k.write(text)
        result = p_json['result']
        self.playlists = result['playlists']
Example #10
    def update_fund_code_info(self):
        content = functions.get_page(
            "http://fund.eastmoney.com/js/fundcode_search.js").content.decode()
        # The file is a JS assignment and may carry a UTF-8 BOM; feeding it to
        # json.loads raw raises: json.decoder.JSONDecodeError: Unexpected UTF-8
        # BOM (decode using utf-8-sig)
        content = content.replace("var r = ", "", 1)
        content = content.replace(";", "", 1)
        content = content.encode().decode('utf-8-sig')
        content = json.loads(content)
        df = pd.DataFrame(content,
                          columns=["CODE", "WORD", "NAME", "TYPE", "PINYIN"])
        df.set_index("CODE", inplace=True)
        df = df.drop('PINYIN', axis=1)
        df.to_csv("resource/fundcode_info/fundcode_search.csv",
                  encoding="UTF-8")
Example #11
    def get_page_data(self, data, url, params):
        # Fetch the data from the page
        content = functions.get_page(url, params=params).content
        soup = BeautifulSoup(content, "lxml")

        tbody = soup.find(name="tbody").find_all(name="tr")
        for tdays in tbody:
            tds = tdays.find_all(name="td")
            if tds[0].text == "暂无数据!":  # "No data yet!"
                continue
            date = tds[0].text
            data[date] = []
            if date not in self.nw_df["DATE"].tolist():
                self.nw_df = self.nw_df.append({"DATE": date},
                                               ignore_index=True)
            if date not in self.aw_df["DATE"].tolist():
                self.aw_df = self.aw_df.append({"DATE": date},
                                               ignore_index=True)
            if date not in self.wi_df["DATE"].tolist():
                self.wi_df = self.wi_df.append({"DATE": date},
                                               ignore_index=True)

            nw_index = self.nw_df[self.nw_df["DATE"] == date].index.tolist()
            aw_index = self.aw_df[self.aw_df["DATE"] == date].index.tolist()
            wi_index = self.wi_df[self.wi_df["DATE"] == date].index.tolist()
            for i in range(len(tds)):
                data[date].append(tds[i].text)
                flag = False
                if i == 1:
                    self.nw_df.loc[nw_index, params["code"]] = tds[i].text
                    flag = True
                if i == 2:
                    self.aw_df.loc[aw_index, params["code"]] = tds[i].text
                    flag = True
                if i == 3:
                    self.wi_df.loc[wi_index, params["code"]] = tds[i].text
                    flag = True
                if flag:
                    self.progress += 0.3333333
        return data
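The DataFrame.append calls above were deprecated in pandas 1.4 and removed in 2.0; a hedged equivalent using pd.concat, with `append_row` being a hypothetical helper:

import pandas as pd

def append_row(df, row):
    # row is a plain dict such as {"DATE": date}
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# e.g. self.nw_df = append_row(self.nw_df, {"DATE": date})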
Example #12
    def get_lyric(self):
        """Fetch the lyrics."""
        self.songs['lyric'] = []
        total = len(self.songs['id'])
        for song_id in self.songs['id']:
            url = 'http://music.163.com/api/song/lyric?os=pc&id=' \
                  + song_id \
                  + '&lv=-1&kv=-1&tv=-1'
            # Get the lyric content
            content = func.get_page(url).json()
            if ('lrc' in content and 'nolyric' not in content
                    and content['lrc'] is not None):
                lyric = content['lrc']['lyric']
                # Clean the lyrics: strip the timestamps, credits, etc.
                lyric = re.sub(r'\[.*?\]', '', lyric)
                templist = lyric.split('\n')
                lyric = ''

                for t in templist:
                    # Skip metadata lines such as credits, which contain a
                    # half-width or full-width colon
                    if re.findall(':', t) or re.findall('：', t):
                        continue
                    lyric = lyric + t + '\n'
                self.songs['lyric'].append(lyric)
                self.only_lyric.append(lyric)
                print('completed lyric' +
                      str(round(self.songs['id'].index(song_id) / total * 100, 2)) +
                      '% ', end='')
                print('added lyric id: ' + song_id)
            else:
                # Placeholder so the lyric column has no floating-point NaN holes
                self.songs['lyric'].append('ThisShallBeIgnored')
Example #13
    def get_plist(self, url, st):
        # Build the playlist
        content = func.get_page(url).text
        # Create a new BeautifulSoup object
        soup = BeautifulSoup(content, 'lxml')
        plist = soup.find(name='ul', attrs={'class': 'f-hide'})
        # Pass the csv file name according to the toggle
        if not st.toggle:
            st.csv_fname = st.playlist_title = soup.find(name='h2',
                                                         class_='f-ff2').string
        # Filter the data
        for song in plist.find_all(name='li'):
            # id (renamed from `id` to avoid shadowing the built-in)
            match = re.search('=([0-9]+)', song.a['href'])
            # Avoid recording duplicate song names
            song_id = match.group(1)
            if song_id not in self.songs['id']:
                self.songs['id'].append(song_id)
                # name
                song_name = song.a.string
                self.songs['name'].append(song_name)
                # url
                song_url = 'https://music.163.com' + song.a['href']
                self.songs['url'].append(song_url)
Example #14
    def get_detail(self):
        self.songs['songer'] = []
        self.songs['fee'] = []
        self.songs['album'] = []
        self.songs['publishTime'] = []
        self.songs['company'] = []
        self.songs['popularity'] = []
        self.songs['duration'] = []
        self.songs['score'] = []

        total = len(self.songs['id'])
        for song_id in self.songs['id']:
            url = 'http://music.163.com/api/song/detail/?id=' + song_id \
                  + '&ids=%5B' + song_id + '%5D'
            # Get song detail
            content = func.get_page(url).json()
            name = content['songs'][0]['artists'][0]['name']
            fee = content['songs'][0]['fee']
            album = content['songs'][0]['album']['name']
            publishTime = content['songs'][0]['album']['publishTime']
            company = content['songs'][0]['album']['company']
            popularity = content['songs'][0]['popularity']
            duration = content['songs'][0]['duration']
            score = content['songs'][0]['score']

            if name is not None and name != '':
                self.songs['songer'].append(name)
            else:
                self.songs['songer'].append('UnKown')
            if fee is not None:
                self.songs['fee'].append(fee)
            else:
                self.songs['fee'].append(0)
            if album is not None and album != '':
                self.songs['album'].append(album)
            else:
                self.songs['album'].append('UnKown')
            if publishTime is not None:
                self.songs['publishTime'].append(publishTime)
            else:
                self.songs['publishTime'].append(1568304000000)
            if company is not None and company != '':
                self.songs['company'].append(company)
            else:
                self.songs['company'].append('UnKown')
            if popularity is not None:
                self.songs['popularity'].append(popularity)
            else:
                self.songs['popularity'].append(50)
            if duration is not None:
                self.songs['duration'].append(duration)
            else:
                self.songs['duration'].append(93000)
            if score is not None:
                self.songs['score'].append(score)
            else:
                self.songs['score'].append(50)
            print(
                'completed detail' +
                str(round(self.songs['id'].index(song_id) / total * 100, 2)) +
                '% ',
                end='')
            print('added detail id: ' + song_id)
            time.sleep(random.uniform(1, 2))
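The eight None-checks above all follow the same fill-with-default pattern; a hypothetical refactor that keeps the original keys and fallback values:

# Sketch only: same defaults as the if/else chain above.
DEFAULTS = {
    'songer': 'UnKown', 'fee': 0, 'album': 'UnKown',
    'publishTime': 1568304000000, 'company': 'UnKown',
    'popularity': 50, 'duration': 93000, 'score': 50,
}

def append_field(songs, key, value):
    # Fall back to the default when the API returned None (or an empty string).
    if value is None or value == '':
        value = DEFAULTS[key]
    songs[key].append(value)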
Example #15
def list_housing(n):
    '''n is the page number to display.'''
    rows = get_page(n)
    recommended = recommend()
    return render_template('list.html', rows=rows, recommended=recommended)
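list_housing reads like a Flask view; a hypothetical route wiring, assuming a Flask `app` object and that `get_page`, `recommend`, and the template are available as above (the URL pattern is an assumption, not from the source):

from flask import Flask

app = Flask(__name__)

@app.route('/list/<int:n>')  # assumed route, not from the source
def list_housing_page(n):
    return list_housing(n)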