def gthread_post(soup):  # {{{
    """
    @return (df, thread_post_dict): one row per thread_id with all post_ids
    """
    # [re.sub(r"#(q|p)", "/", s["href"].replace("thread/", "") for s in ...]
    df = ([
        s["href"]
        | p(re.sub, r"#(q|p)", "/", px)
        | px.replace("thread/", "")
        for s in soup.select("span.postNum a")
    ]
          | p(uniset)
          | p(map, px.split("/"))
          | p(filter, px[0] != px[1])
          # | p(lambda l: list(zip(*l)))
          | p(pd.DataFrame, columns=['thread', 'post']))
    # ff = list(zip(thread_id, post_id))
    # df = pd.DataFrame({'thread': thread_id, 'post': post_id})
    df = df.groupby('thread')['post'].apply(np.array).reset_index(name='post')
    post_df = (pd.DataFrame(df['post'].to_list())
               | px.rename(columns=lambda x: 'post_' + str(x))
               | px.fillna(np.nan))
    df = pd.concat([df, post_df], axis=1).drop('post', axis=1)
    thread_post_dict = df.set_index('thread').T.to_dict('list')
    return df, thread_post_dict
    # }}}
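
# Hedged usage sketch, not part of the original module: assumes board HTML
# where each "span.postNum a" href looks like "thread/<tid>#p<pid>", and that
# `uniset` (defined elsewhere in this codebase) deduplicates an iterable.
def _demo_gthread_post():
    from bs4 import BeautifulSoup
    html = ('<span class="postNum">'
            '<a href="thread/123#p123">No.</a>'
            '<a href="thread/123#p456">456</a>'
            '</span>')
    df, thread_post_dict = gthread_post(BeautifulSoup(html, 'html.parser'))
    print(df)                # one row: thread "123" with post_0 == "456"
    print(thread_post_dict)  # {'123': ['456']}
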
def gimg_size(soup):  # {{{
    """
    @return image size in kilobytes
    """
    img_size = []
    for s in soup.select(".fileText"):
        try:
            img_size.append(
                re.search(r"(?<=\()\d+\s(K|M)B", s.get_text()).group())
        except AttributeError:
            # no match: re.search returned None, so append the None
            # placeholder for rm_na to replace downstream
            img_size.append(re.search(r"(?<=\()\d+\s(K|M)B", s.get_text()))
    return img_size | p(rm_na, '0 KB') | p(mb2kb)
    # }}} much more elegant IMO
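
# Hedged usage sketch, not part of the original module: assumes .fileText
# entries like "name.jpg (512 KB, 800x600)" and that the helpers rm_na and
# mb2kb (defined elsewhere here) fill None placeholders and convert MB to KB.
def _demo_gimg_size():
    from bs4 import BeautifulSoup
    html = ('<div class="fileText">a.jpg (512 KB, 800x600)</div>'
            '<div class="fileText">b.png (2 MB, 1024x768)</div>')
    print(gimg_size(BeautifulSoup(html, 'html.parser')))  # e.g. [512, 2048]
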
def unzip_files(source_dir):
    files = glob.glob(join(source_dir, "*.zip"))
    for f in files:
        filepath = os.path.splitext(f)[0]  # unzipped path
        if not exists(filepath):
            file = filepath | p(basename)
            path = filepath | p(dirname)
            ZipFile(f).extract(member=file, path=path)
        if exists(f):
            os.remove(f)
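
# Hedged self-contained sketch, not part of the original module: each archive
# is assumed to hold a single member named after the archive itself
# (orders.csv.zip -> orders.csv), matching how unzip_files picks the member.
def _demo_unzip_files():
    import tempfile
    d = tempfile.mkdtemp()
    with open(join(d, 'orders.csv'), 'w') as fh:
        fh.write('order_id\n1\n')
    with ZipFile(join(d, 'orders.csv.zip'), 'w') as z:
        z.write(join(d, 'orders.csv'), arcname='orders.csv')
    os.remove(join(d, 'orders.csv'))
    unzip_files(d)        # extracts orders.csv, then deletes the .zip
    print(os.listdir(d))  # ['orders.csv']
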
def gimg_dim(soup):  # {{{
    """
    Return width, height of image
    """
    img_dim = []
    for s in soup.select(".fileText"):
        try:
            img_dim.append(re.search(r"(?<=,\s)\d+x\d+", s.get_text()).group())
        except AttributeError:
            # no match: appends None, which x.split("x") below cannot handle
            img_dim.append(re.search(r"(?<=,\s)\d+x\d+", s.get_text()))
    return [x.split("x") for x in img_dim] | p(lambda l: list(zip(*l)))
    # }}}
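
# Hedged usage sketch, not part of the original module: the lookbehind expects
# the dimensions to follow ", " in the file text, as in "(512 KB, 800x600)".
def _demo_gimg_dim():
    from bs4 import BeautifulSoup
    html = ('<div class="fileText">a.jpg (512 KB, 800x600)</div>'
            '<div class="fileText">b.png (2 MB, 1024x768)</div>')
    widths, heights = gimg_dim(BeautifulSoup(html, 'html.parser'))
    print(widths)   # ('800', '1024')
    print(heights)  # ('600', '768')
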
def reqHwpFileDown(inHwpFile, inCaSvephy):
    prefixUrl = 'https://gnews.gg.go.kr/Operator/reporter_room/notice/download.do?'
    reqHwpUrl = ('{}file={}&BS_CODE=s017&CA_SAVEPHY={}'.format(
        prefixUrl, inHwpFile, inCaSvephy)
                 | p(parse.urlparse).query
                 | p(parse.parse_qs)
                 | p(parse.urlencode, doseq=True)
                 | prefixUrl + px)

    saveHwpFile = '{}/{}/{}'.format(globalVar['outPath'], serviceName, inHwpFile)

    # Create the directory if it does not exist
    if not os.path.exists(os.path.dirname(saveHwpFile)):
        os.makedirs(os.path.dirname(saveHwpFile))

    # Check whether the file already exists
    isFile = os.path.exists(saveHwpFile)
    # if isFile: return Pa

    res = urllib.request.urlopen(reqHwpUrl)
    resCode = res.getcode()
    resSize = int(res.headers['content-length'])
    if resCode != 200:
        return False
    if resSize < 82:
        return False

    with open(saveHwpFile, mode="wb") as f:
        f.write(res.read())

    log.info('[CHECK] saveHwpFile : {} / {} / {}'.format(
        inCaSvephy, isFile, saveHwpFile))
    return True
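
# Hedged usage sketch, not part of the original source: the file name and
# CA_SAVEPHY value below are made up, and globalVar['outPath'], serviceName
# and log must already be configured by the surrounding script.
def _demo_reqHwpFileDown():
    ok = reqHwpFileDown('press_release.hwp', '20210101')  # hypothetical args
    print('downloaded' if ok else 'skipped (bad status or undersized body)')
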
def main():
    # Current location
    current_loc = get_current_loc()
    # CSUF location
    # current_loc = [33.8829, -117.8869]
    data_ = 'simiulated_data_stream.txt' | p(read_file) | p(generate_nodes)
    for x in data_:
        print('New Incoming Orders:\n-------------------')
        print(
            'Request ID:\t{}\nUser ID:\t{}\nRestaurant:\t{}\nItem:\t\t{}\nPrice:\t\t${}\n'
            .format(x['request_id'], x['user_id'],
                    x['payload']['restaurant_name'],
                    x['payload']['item']['name'],
                    x['payload']['item']['price']))
        print('\nAnalyzing . . .\n\n')
        print('Recommendation:')
        recommended_res = x['payload']['restaurant_name'] | p(
            get_restaurant_alias, px) | p(get_recommendation, px,
                                          current_loc[0], current_loc[1])
        recommended_menu = recommended_res['name'] | p(
            get_restaurant_id, px, current_loc[0], current_loc[1]) | p(
                get_restaurant_menu, px)
        print('Restaurant:\t{}\nItem:\t\t{}\n'.format(
            recommended_res['name'], recommended_menu['menu_item_name']))
def test_list_creation():
    assert (2 | p([1, px, px + 1])) == [1, 2, 3]
def test_set_creation():
    assert (2 | p({1, px, px + 1})) == {1, 2, 3}
def test_dict_creation():
    assert (2 | p({1: px, px: 3, px + 1: px + 2, 4: 5})) == {
        i: i + 1 for i in range(1, 5)
    }
def _format_dishes(dishes: list) -> list:
    return (dishes
            | p(map, lambda x: x.get_text())
            | p(map, _remove_extra_whitespace)
            | p(map, lambda x: x.rstrip())
            | p(list))
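
# Hedged usage sketch, not part of the original module: assumes `dishes` is a
# list of bs4 tags (e.g. from soup.select) and that _remove_extra_whitespace
# (defined elsewhere here) collapses internal runs of whitespace.
def _demo_format_dishes():
    from bs4 import BeautifulSoup
    html = '<li>  Pasta   Carbonara </li><li> Miso  Soup </li>'
    dishes = BeautifulSoup(html, 'html.parser').select('li')
    print(_format_dishes(dishes))  # e.g. ['Pasta Carbonara', 'Miso Soup']
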
from sspipe import p, px

pinc = p(lambda x: x + 1)
pcdr = p(lambda x: x[1:])


def test_level1():
    cases = [
        [True, 1 not in [1, 2] | pcdr],
        [True, 2 | pinc > 2],
        [False, 2 | pinc < 3],
        [1, 2 | pinc & 5],
        [7, 2 | (pinc | 4)],
        [1 | pinc + 2, 4],
        # TODO: write test for rest
    ]
    for expected, result in cases:
        assert expected == result


def test_level2():
    result = 1 | (px == px)
    assert result == True


def test_divide():
    pipeline = 1 / px
    assert (2 | pipeline) == 0.5

    pipeline = (px + 1) / (px + 2)
    assert (2 | pipeline) == 0.75
def test_map_filter():
    assert (range(3)
            | p(filter, px % 2 == 0)
            | p(map, px + 1)
            | p(list)
            | (px == [1, 3]))
def plmap(func):
    return p(lambda x: map(func, x)) | p(list)
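
# Hedged usage sketch, not part of the original module: plmap builds a
# reusable pipe that maps `func` over its input and materializes a list.
def _demo_plmap():
    assert ([1, 2, 3] | plmap(lambda x: x * 2)) == [2, 4, 6]
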
def test_rhs():
    assert np.array([1, 2]) | p(lambda x: x.sum()) | (px == 3)
def test_lhs():
    assert 2 | p(np.log2) | (px == 1)
def get_data(soup):
    pattern = re.compile(r"\([a-z0-9.\-]+[.](\w+)\)")

    try:
        title_ = [
            x.get_text(' ', strip=True)
            for x in soup.select("h2 span.story-title")
        ]
        title = [pattern.sub('', x).strip() for x in title_]
    except Exception:
        title = ""

    try:
        dated = [
            d.get_text(" ", strip=True)
            for d in soup.select("span.story-byline time")
        ]
        date = [re.sub("on|@", "", x).strip() for x in dated]
    except Exception:
        date = ""

    try:
        curls = {}
        ex = [
            x.get_text(' ', strip=True)
            for x in soup.select("h2 span.story-title")
        ]
        for idx, u in enumerate(ex):
            if not pattern.search(u):
                curls[idx] = "Empty"
            else:
                curls[idx] = pattern.search(u).group()
        elink = list(curls.values())
    except Exception:
        try:
            elink = [l.text.strip() for l in soup.select("h2 span span.no")]
        except Exception:
            elink = ""

    try:
        comments = (
            [x.get_text() for x in soup.select("span.comment-bubble a")]
            | p(np.array)
            | px.astype("int"))
    except Exception:
        comments = ""

    try:
        cat = ([b.get("alt") for b in soup.find_all("img")]
               | p(list, p(filter, None, px)))
        category = ([x.replace("Icon", "") for x in cat]
                    | p(filter, None)
                    | p(list))
    except Exception:
        category = ""

    try:
        user = [
            u.get_text(" ", strip=True).replace("\n", "").replace("\t", "")
            for u in soup.select("span.story-byline")
        ]
        user = [
            " ".join(a.split()) | p(re.findall, r"Postedby\s(\w+)", px)
            for a in user
        ]
    except Exception:
        user = ""

    try:
        pop = [
            re.findall(r"'([a-zA-Z0-9,\s]*)'", prop["onclick"]) | px[1]
            for prop in soup.find_all("span", attrs={"alt": "Popularity"})
        ]
    except Exception:
        pop = ""

    temp = pd.DataFrame({
        "title": title,
        "date": date,
        "exlink": elink,
        "comments": comments,
        "category": category,
        "user": user,
        "popular": pop
    })
    return temp
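
# Hedged usage sketch, not part of the original module: a minimal
# Slashdot-style fragment exercising every selector above; note get_data
# expects each field to match something, otherwise column lengths can
# disagree when pd.DataFrame is built.
def _demo_get_data():
    from bs4 import BeautifulSoup
    html = ('<h2><span class="story-title">'
            '<a>Kernel 6.0 Released</a> (kernel.org)</span></h2>'
            '<span class="story-byline">Postedby editor '
            '<time>on Monday October 03, 2022 @10:00AM</time></span>'
            '<span class="comment-bubble"><a>42</a></span>'
            '<img alt="LinuxIcon"/>'
            '<span alt="Popularity" onclick="pop(\'a\', \'hot\')"></span>')
    print(get_data(BeautifulSoup(html, 'html.parser')))  # one-row DataFrame
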
def test_simple():
    def f(x, y):
        return x * y

    result = 1 | p(f, px + 1, px + 2)
    assert result == 6
# title of post
title = [
    j for i in soup.find_all("span", class_="story-title")
    for j in i.find("a")
]

# date/time of post
# import dateutil.parser
# dateutil.parser.parse('string')
dated = [d.get_text() for d in soup.select("span.story-byline time")]
date = [re.sub("on|@", "", x).strip() for x in dated]
dt = [dt.strptime(d, "%A %B %d, %Y %I:%M%p") for d in date]

# external link to post
elink = [l.text.strip() for l in soup.select("h2 span span")]

# comments on post
comments = ([x.get_text() for x in soup.select("span.comment-bubble a")]
            | p(np.array)
            | px.astype("int"))

# category of post
cat = [b.get("alt") for b in soup.find_all("img")] | p(list, p(filter, None, px))
# Using this sort of as a try/except in case it were to not exist
category = [x.replace("Icon", "") for x in cat] | p(filter, None) | p(list)

# user who made the post
user = [
    u.get_text(" ", strip=True).replace("\n", "").replace("\t", "")
    for u in soup.select("span.story-byline")
]
user = [
    " ".join(a.split()) | p(re.findall, r"Postedby\s(\w+)", px)
    for a in user
]
def test_pipe_args():
    def f(x, y):
        return x * y

    assert (1 | p(f, px + 1, px + 2)) == 6
def test_normal_args():
    assert (1 | p('{}{}{x}'.format, 2, x=3)) == '123'
def test_tuple_creation():
    assert (2 | p((1, px, px + 1))) == (1, 2, 3)
def test_integration_with_px():
    assert range(3) | p.select(px + 1) | p(list) | (px == [1, 2, 3])
def test_plmap():
    result = [1, 2] | p(map, lambda x: x + 1, px) | p(list)
    assert result == [2, 3]
def test_scalar_rhs():
    assert np.int32(1) | p(lambda x: x + 1) | (px == 2)
def test_pd_series():
    result = pd.Series([1, 2]) | p(list)
    assert result == [1, 2]
def download_data(directory):
    if dirname(directory) != 'data':
        directory = join(directory, 'data')
    if not exists(directory):
        print(f'{directory} not found. Creating..')
        os.makedirs(directory)

    names_and_files = [
        ("aisles", "aisles.csv.zip"),
        ("orders", "orders.csv.zip"),
        ("departments", "departments.csv.zip"),
        ("products", "products.csv.zip"),
        ("order-products-train", "order_products__train.csv.zip"),
        ("order-products-prior", "order_products__prior.csv.zip"),
        ("sample-submission", "sample_submission.csv.zip"),
    ]
    datasets = [
        KaggleData(
            name=name,
            competition="instacart-market-basket-analysis",
            source_filename=filename,
            destination_dir=directory,
        ) for name, filename in names_and_files
    ]

    for source in datasets:
        dest = join(source.destination_dir, source.source_filename)
        # skip the download when either the archive or its extracted file exists
        if (not dest | p(exists)) and (not splitext(dest)[0] | p(exists)):
            subprocess.run(source.cli_download_command())
        else:
            # TODO: log a warning (do not print)
            pass

    unzip_files(directory)

    results = {}
    for d in datasets:
        path = d.get_path()
        if splitext(path)[1] == '.zip':
            path = splitext(path)[0]
        results[d.name] = path
    return results
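
# Hedged usage sketch, not part of the original module: requires an
# authenticated Kaggle CLI (KaggleData.cli_download_command shells out to it)
# and acceptance of the competition rules on kaggle.com.
def _demo_download_data():
    paths = download_data('.')  # downloads into ./data on the first run
    print(paths['orders'])      # e.g. ./data/orders.csv
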
def test_divide_fallback():
    assert (dict(x=2, y=3).keys() / p(list) | p(set)) == {'x', 'y'}
    assert (dict(x=2, y=3).values() / p(list) | p(set)) == {2, 3}
def test_simple():
    assert range(3) | p.select(lambda x: x + 1) | p(list) | (px == [1, 2, 3])