Example #1
def gthread_post(soup):  # {{{
    """
    @return dataframe thread_id & all post_ids
    """
    #  [re.sub(r"#(q|p)", "/", s["href"].replace("thread/", "") for s in ...]
    df = ([
        s["href"] | p(re.sub, r"#(q|p)", "/", px) | px.replace("thread/", "")
        for s in soup.select("span.postNum a")
    ]
          | p(uniset)
          | p(map, px.split("/"))
          | p(filter, px[0] != px[1])
          #  | p(lambda l: list(zip(*l)))
          | p(pd.DataFrame, columns=['thread', 'post']))
    #  ff = list(zip(thread_id, post_id))
    #  df = pd.DataFrame({'thread':thread_id, 'post':post_id})

    df = df.groupby('thread')['post'].apply(np.array).reset_index(name='post')
    post_df = (pd.DataFrame(df['post'].to_list())
               | px.rename(columns=lambda x: 'post_' + str(x))
               | px.fillna(np.nan))
    df = pd.concat([df, post_df], axis=1).drop('post', axis=1)
    thread_post_dict = df.set_index('thread').T.to_dict('list')

    return df, thread_post_dict  # }}}
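All of these examples use the sspipe library: p(...) wraps a callable into a pipeable step and px is a placeholder for the piped value. A minimal, self-contained sketch of the idiom (assuming only that sspipe is installed):

from sspipe import p, px

# p(func, *args) pipes the value into func; px marks where it goes.
assert (5 | p(lambda x: x + 1)) == 6
assert (range(3) | p(map, px * 2) | p(list)) == [0, 2, 4]
assert ("thread/123#q456" | px.replace("thread/", "")) == "123#q456"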
Example #2
def gimg_size(soup):  # {{{
    """
    @return image size in kilobytes
    """
    img_size = []
    for s in soup.select(".fileText"):
        match = re.search(r"(?<=\()\d+\s(K|M)B", s.get_text())
        # keep the matched "<n> KB/MB" string, or None when nothing matched
        img_size.append(match.group() if match else None)
    return img_size | p(rm_na, '0 KB') | p(mb2kb)  # }}} much more elegant IMO
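rm_na and mb2kb are project helpers that are not shown here. Judging by how they are used, plausible minimal sketches (hypothetical, not the author's code) might look like:

def rm_na(seq, fill):
    # Hypothetical: replace failed matches (None) with a fill value.
    return [fill if x is None else x for x in seq]


def mb2kb(sizes):
    # Hypothetical: normalize "<n> KB" / "<n> MB" strings to kilobyte counts.
    return [int(s.split()[0]) * 1024 if s.endswith("MB") else int(s.split()[0])
            for s in sizes]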
Example #3
def unzip_files(source_dir):
    files = glob.glob(join(source_dir, "*.zip"))

    for f in files:
        filepath = os.path.splitext(f)[0]  # unzipped path

        if not exists(filepath):
            file = filepath | p(basename)
            path = filepath | p(dirname)

            ZipFile(f).extract(member=file, path=path)

        if exists(f):
            os.remove(f)
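extract(member=file, path=path) assumes each archive contains exactly one member named after the archive itself (e.g. orders.csv inside orders.csv.zip), which holds for these Kaggle exports. A more defensive variant (not the author's code) would extract whatever the archive actually contains:

from zipfile import ZipFile

def unzip_all(source_path, dest_dir):
    # Extract every member rather than assuming a single known name.
    with ZipFile(source_path) as zf:
        zf.extractall(path=dest_dir)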
Example #4
def gimg_dim(soup):  # {{{
    """
    Return width, height of image
    """
    img_dim = []
    for s in soup.select(".fileText"):
        match = re.search(r"(?<=,\s)\d+x\d+", s.get_text())
        # keep the matched "<w>x<h>" string, or None when nothing matched
        img_dim.append(match.group() if match else None)

    return [x.split("x") for x in img_dim] | p(lambda l: list(zip(*l)))  # }}}
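The final p(lambda l: list(zip(*l))) step transposes the list of [width, height] pairs into a (widths, heights) pair of tuples, for example:

from sspipe import p

dims = [["640", "480"], ["800", "600"]]
assert (dims | p(lambda l: list(zip(*l)))) == [("640", "800"), ("480", "600")]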
Example #5
def reqHwpFileDown(inHwpFile, inCaSvephy):
    prefixUrl = 'https://gnews.gg.go.kr/Operator/reporter_room/notice/download.do?'

    reqHwpUrl = ('{}file={}&BS_CODE=s017&CA_SAVEPHY={}'.format(
        prefixUrl, inHwpFile, inCaSvephy)
                 | p(parse.urlparse).query
                 | p(parse.parse_qs)
                 | p(parse.urlencode, doseq=True)
                 | prefixUrl + px)

    saveHwpFile = '{}/{}/{}'.format(globalVar['outPath'], serviceName,
                                    inHwpFile)

    # Create the directory if it does not exist
    if not os.path.exists(os.path.dirname(saveHwpFile)):
        os.makedirs(os.path.dirname(saveHwpFile))

    # Check whether the file already exists
    isFile = os.path.exists(saveHwpFile)

    # if isFile: return Pa

    res = urllib.request.urlopen(reqHwpUrl)
    resCode = res.getcode()
    resSize = int(res.headers['content-length'])

    if resCode != 200:
        return False

    if resSize < 82:  # suspiciously small responses are likely error pages
        return False

    with open(saveHwpFile, mode="wb") as f:
        f.write(res.read())

    log.info('[CHECK] saveHwpFile : {} / {} / {}'.format(
        inCaSvephy, isFile, saveHwpFile))

    return True
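The reqHwpUrl pipeline round-trips the query string through urlparse -> parse_qs -> urlencode(doseq=True), which normalizes and percent-encodes the parameters before re-attaching the prefix. The same transformation in plain standard-library calls (an equivalent sketch with a made-up file name, not the original code):

from urllib import parse

url = 'https://gnews.gg.go.kr/Operator/reporter_room/notice/download.do?file=a.hwp&BS_CODE=s017'
query = parse.urlparse(url).query              # 'file=a.hwp&BS_CODE=s017'
params = parse.parse_qs(query)                 # {'file': ['a.hwp'], 'BS_CODE': ['s017']}
encoded = parse.urlencode(params, doseq=True)  # 'file=a.hwp&BS_CODE=s017'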
Example #6
def main():
    # Current location
    current_loc = get_current_loc()

    # CSUF location
    # current_loc = [33.8829, -117.8869]

    data_ = 'simiulated_data_stream.txt' | p(read_file) | p(generate_nodes)

    for x in data_:
        print('New Incoming Orders:\n-------------------')

        print(
            'Request ID:\t{}\nUser ID:\t{}\nRestaurant:\t{}\nItem:\t\t{}\nPrice:\t\t${}\n'
            .format(x['request_id'], x['user_id'],
                    x['payload']['restaurant_name'],
                    x['payload']['item']['name'],
                    x['payload']['item']['price']))
        print('\nAnalyzing . . .\n\n')
        print('Recommendation:')
        recommended_res = x['payload']['restaurant_name'] | p(
            get_restaurant_alias, px) | p(get_recommendation, px,
                                          current_loc[0], current_loc[1])
        recommended_menu = recommended_res['name'] | p(
            get_restaurant_id, px, current_loc[0], current_loc[1]) | p(
                get_restaurant_menu, px)

        print('Restaurant:\t{}\nItem:\t\t{}\n'.format(
            recommended_res['name'], recommended_menu['menu_item_name']))
Example #7
def test_list_creation():
    assert (2 | p([1, px, px + 1])) == [1, 2, 3]
Example #8
def test_set_creation():
    assert (2 | p({1, px, px + 1})) == {1, 2, 3}
Example #9
def test_dict_creation():
    assert (2 | p({1: px, px: 3, px + 1: px + 2, 4: 5})) == {i: i + 1 for i in range(1, 5)}
Example #10
def _format_dishes(dishes: list) -> list:
    return (dishes
            | p(map, lambda x: x.get_text())
            | p(map, _remove_extra_whitespace)
            | p(map, lambda x: x.rstrip())
            | p(list))
Example #11
from sspipe import p, px

pinc = p(lambda x: x + 1)
pcdr = p(lambda x: x[1:])


def test_level1():
    cases = [
        [True, 1 not in [1, 2] | pcdr],
        [True, 2 | pinc > 2],
        [False, 2 | pinc < 3],
        [1, 2 | pinc & 5],
        [7, 2 | (pinc | 4)],
        [1 | pinc + 2, 4],
        # TODO: write test for rest
    ]
    for expected, result in cases:
        assert expected == result


def test_level2():
    result = 1 | (px == px)
    assert result is True


def test_divide():
    pipeline = 1 / px
    assert (2 | pipeline) == 0.5

    pipeline = (px + 1) / (px + 2)
    assert (2 | pipeline) == 0.75
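Several of these cases rely on Python operator precedence: | binds tighter than comparisons, so the pipe runs before the comparison, while & binds tighter than |, so pinc & 5 forms a single combined step. Spelled out with explicit parentheses:

assert ((2 | pinc) > 2) == (2 | pinc > 2)    # comparisons bind loosest
assert (2 | (pinc & 5)) == (2 | pinc & 5)    # & binds tighter than |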
Example #12
def test_map_filter():
    assert range(3) | p(filter, px % 2 == 0) | p(
        map, px + 1) | p(list) | (px == [1, 3])
Example #13
def plmap(func):
    return p(lambda x: map(func, x)) | p(list)
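plmap composes two pipes into one reusable step: map the function over the piped iterable, then materialize the result as a list. A usage sketch (assuming p is imported from sspipe as in the other examples):

from sspipe import p

def plmap(func):
    return p(lambda x: map(func, x)) | p(list)

assert ([1, 2, 3] | plmap(lambda x: x * 2)) == [2, 4, 6]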
Example #14
def test_rhs():
    assert np.array([1, 2]) | p(lambda x: x.sum()) | (px == 3)
Example #15
def test_lhs():
    assert 2 | p(np.log2) | (px == 1)
Example #16
def get_data(soup):
    pattern = re.compile(r"\([a-z0-9.\-]+[.](\w+)\)")

    try:
        title_ = [
            x.get_text(' ', strip=True)
            for x in soup.select("h2 span.story-title")
        ]
        title = [pattern.sub('', x).strip() for x in title_]
    except Exception:
        title = ""

    try:
        dated = [
            d.get_text(" ", strip=True)
            for d in soup.select("span.story-byline time")
        ]
        date = [re.sub("on|@", "", x).strip() for x in dated]
    except Exception:
        date = ""

    try:
        curls = {}
        ex = [
            x.get_text(' ', strip=True)
            for x in soup.select("h2 span.story-title")
        ]

        for idx, u in enumerate(ex):
            if not pattern.search(u):
                curls[idx] = "Empty"
            else:
                curls[idx] = pattern.search(u).group()

        elink = list(curls.values())
    except Exception:
        try:
            elink = [l.text.strip() for l in soup.select("h2 span span.no")]
        except Exception:
            elink = ""

    try:
        comments = (
            [x.get_text() for x in soup.select("span.comment-bubble a")]
            | p(np.array)
            | px.astype("int"))
    except Exception:
        comments = ""

    try:
        cat = ([b.get("alt") for b in soup.find_all("img")]
               | p(list, p(filter, None, px)))
        category = ([x.replace("Icon", "") for x in cat]
                    | p(filter, None)
                    | p(list))
    except Exception:
        category = ""

    try:
        user = [
            u.get_text(" ", strip=True).replace("\n", "").replace("\t", "")
            for u in soup.select("span.story-byline")
        ]
        user = [
            " ".join(a.split())
            | p(re.findall, r"Postedby\s(\w+)", px) for a in user
        ]
    except Exception:
        user = ""

    try:
        pop = [
            re.findall(r"'([a-zA-Z0-9,\s]*)'", prop["onclick"]) | px[1]
            for prop in soup.find_all("span", attrs={"alt": "Popularity"})
        ]
    except Exception:
        pop = ""

    temp = pd.DataFrame({
        "title": title,
        "date": date,
        "exlink": elink,
        "comments": comments,
        "category": category,
        "user": user,
        "popular": pop
    })
    return temp
Example #17
def test_simple():
    def f(x, y):
        return x * y

    result = 1 | p(f, px + 1, px + 2)
    assert result == 6
Example #18
# title of post
title = [
    j for i in soup.find_all("span", class_="story-title") for j in i.find("a")
]

# date/time of post
# import dateutil.parser
# dateutil.parser.parse('string')
dated = [d.get_text() for d in soup.select("span.story-byline time")]
date = [re.sub("on|@", "", x).strip() for x in dated]
dts = [dt.strptime(d, "%A %B %d, %Y %I:%M%p") for d in date]  # don't shadow the dt alias

# external link to post
elink = [l.text.strip() for l in soup.select("h2 span span")]

# comments on post
comments = ([x.get_text() for x in soup.select("span.comment-bubble a")]
            | p(np.array)
            | px.astype("int"))

# category of post
cat = [b.get("alt")
       for b in soup.find_all("img")] | p(list, p(filter, None, px))
# Using this sort of as a try except in case it were to not exist
category = [x.replace("Icon", "") for x in cat] | p(filter, None) | p(list)

# user who made the post
user = [
    u.get_text(" ", strip=True).replace("\n", "").replace("\t", "")
    for u in soup.select("span.story-byline")
]
user = [
    " ".join(a.split()) | p(re.findall, r"Postedby\s(\w+)", px) for a in user
Пример #19
0
def test_pipe_args():
    def f(x, y):
        return x * y

    assert (1 | p(f, px + 1, px + 2)) == 6
Example #20
def test_normal_args():
    assert (1 | p('{}{}{x}'.format, 2, x=3)) == '123'
Example #21
def test_tuple_creation():
    assert (2 | p((1, px, px + 1))) == (1, 2, 3)
Example #22
def test_integration_with_px():
    assert range(3) | p.select(px + 1) | p(list) | (px == [1, 2, 3])
Example #23
def test_plmap():
    result = [1, 2] | p(map, lambda x: x + 1, px) | p(list)
    assert result == [2, 3]
Example #24
def test_scalar_rhs():
    assert np.int32(1) | p(lambda x: x + 1) | (px == 2)
Example #25
def test_pd_series():
    result = pd.Series([1, 2]) | p(list)
    assert result == [1, 2]
Example #26
def download_data(directory):

    if dirname(directory) != 'data':
        directory = join(directory, 'data')

    if not exists(directory):
        print(f'{directory} not found. Creating..')
        os.makedirs(directory)

    dataset_files = [
        ("aisles", "aisles.csv.zip"),
        ("orders", "orders.csv.zip"),
        ("departments", "departments.csv.zip"),
        ("products", "products.csv.zip"),
        ("order-products-train", "order_products__train.csv.zip"),
        ("order-products-prior", "order_products__prior.csv.zip"),
        ("sample-submission", "sample_submission.csv.zip"),
    ]
    datasets = [
        KaggleData(
            name=name,
            competition="instacart-market-basket-analysis",
            source_filename=filename,
            destination_dir=directory,
        )
        for name, filename in dataset_files
    ]

    for source in datasets:
        dest = join(source.destination_dir, source.source_filename)
        if (not dest | p(exists)) and (not splitext(dest)[0] | p(exists)):
            subprocess.run(source.cli_download_command())
        else:
            # TODO: log a warning here instead of printing
            pass

    unzip_files(directory)

    results = {}

    for d in datasets:
        path = d.get_path()

        if splitext(path)[1] == '.zip':
            path = splitext(path)[0]

        results[d.name] = path

    return results
Example #27
def test_divide_fallback():
    assert (dict(x=2, y=3).keys() / p(list) | p(set)) == {'x', 'y'}
    assert (dict(x=2, y=3).values() / p(list) | p(set)) == {2, 3}
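The / operator appears to be provided as a fallback for left-hand objects that define | themselves (dict views and sets implement | as set union, numpy arrays apply it elementwise), so the pipe can still be expressed unambiguously. A minimal illustration under that assumption:

from sspipe import p

keys = dict(x=2, y=3).keys()
assert (keys / p(sorted)) == ['x', 'y']  # '/' pipes even though keys views define '|'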
Example #28
def test_simple():
    assert range(3) | p.select(lambda x: x + 1) | p(list) | (px == [1, 2, 3])
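Compared with Example #22, p.select(lambda x: x + 1) behaves the same as p.select(px + 1), and both appear to act like a map over the piped iterable. An equivalence check (assuming that inferred behavior):

from sspipe import p, px

a = range(3) | p.select(px + 1) | p(list)
b = range(3) | p(map, px + 1) | p(list)
assert a == b == [1, 2, 3]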