def close_session(session, first_timestamp_ts):
    size = len(session)
    # Creating an artificial session id based on the first click timestamp and a hash of the user id
    first_click = session[0]
    session_id = (int(first_click['timestamp']) * 100) + hash_str_to_int(
        first_click['user_id'].encode(), 3)
    # Converting the timestamp to hours since the first timestamp
    session_hour = int(
        (first_click['timestamp'] - first_timestamp_ts) / (1000 * 60 * 60))
    # Converting to Spark DataFrame Rows, to convert the RDD back to a DataFrame
    # TODO add 'view' here
    clicks = [T.Row(**click) for click in session]
    session_dict = {
        'session_id': session_id,
        'session_hour': session_hour,
        'session_size': size,
        'session_start': first_click['timestamp'],
        'user_id': first_click['user_id'],
        'clicks': clicks
    }
    session_row = T.Row(**session_dict)
    return session_row
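A minimal usage sketch, not from the source: it assumes a clicks_df DataFrame with 'user_id' and 'timestamp' columns, that each user's clicks form a single session, and that hash_str_to_int is available on the executors.

# Hypothetical driver code for close_session (assumed names: clicks_df)
first_ts = clicks_df.agg({'timestamp': 'min'}).collect()[0][0]
sessions_df = (clicks_df.rdd
               # group each user's clicks into one list of dicts
               .map(lambda r: (r['user_id'], [r.asDict()]))
               .reduceByKey(lambda a, b: a + b)
               # order clicks inside the session before closing it
               .map(lambda kv: sorted(kv[1], key=lambda c: c['timestamp']))
               .map(lambda session: close_session(session, first_ts))
               .toDF())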
def test_as_list_shallow(self):
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = self.spark.createDataFrame(input)
    actual = as_list(input_df, False)
    expect = [{'id': 1, 'a': pst.Row(b=101)}]
    self.assertEqual(actual, expect)
def test_as_list_deep(self):
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = self.spark.createDataFrame(input)
    actual = as_list(input_df, True)
    expect = [{'id': 1, 'a': {'b': 101}}]
    self.assertEqual(actual, expect)
def test_as_list_shallow(spark):
    """as_list converts rows to dictionaries without deep conversion"""
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = spark.createDataFrame(input)
    actual = to_list(input_df, False)
    expect = [{'id': 1, 'a': pst.Row(b=101)}]
    assert actual == expect
def test_as_list_deep(spark):
    """as_list converts rows to dictionaries deeply"""
    input = [pst.Row(id=1, a=pst.Row(b=101))]
    input_df = spark.createDataFrame(input)
    actual = to_list(input_df, True)
    expect = [{'id': 1, 'a': {'b': 101}}]
    assert actual == expect
def test_as_list_deep_convert_fields(spark):
    """Given a dataframe with nested structure fields,
    When as_list with shallow=True is called,
    Then the result is a list of dictionaries with the structure field not converted"""
    input = [(1, pst.Row(first_name='John'))]
    input_df = spark.createDataFrame(input, ['id', 'person'])
    actual = as_list(input_df, False)
    expect = [dict(id=1, person=pst.Row(first_name='John'))]
    assert actual == expect
def _create_train_image_uris_and_labels(self, repeat_factor=1, cardinality=100, dense=True):
    image_uris = getSampleImagePaths() * repeat_factor
    # Create image categorical labels (integer IDs)
    local_rows = []
    for uri in image_uris:
        label = np.random.randint(low=0, high=cardinality, size=1)[0]
        if dense:
            label_inds = np.zeros(cardinality)
            label_inds[label] = 1.0
            label_inds = label_inds.ravel()
            assert label_inds.shape[0] == cardinality, label_inds.shape
            one_hot_vec = spla.Vectors.dense(label_inds.tolist())
        else:
            # sparse
            one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1})
        _row_struct = {
            self.input_col: uri,
            self.one_hot_col: one_hot_vec,
            self.one_hot_label_col: float(label)
        }
        row = sptyp.Row(**_row_struct)
        local_rows.append(row)

    image_uri_df = self.session.createDataFrame(local_rows)
    return image_uri_df
def appendWeatherData(ts, stID, tsVec, airTemp, cloudCov, precip1Hr):
    # Returns weather information to be included in the taxi trip
    hrInSec = 3600  # 1 hour = 3600 seconds
    if ts is not None:
        deltaTime = ts - tsVec[stID][0]
        ind = int(round(truediv(deltaTime, hrInSec)))
        if ind < 0:
            ind = 0
        elif ind >= len(tsVec[stID]):
            ind = len(tsVec[stID]) - 1
        return sqlt.Row('airTemp', 'cloudCov', 'precip1Hr')(
            airTemp[stID][ind], cloudCov[stID][ind], precip1Hr[stID][ind])
    else:
        return sqlt.Row('airTemp', 'cloudCov', 'precip1Hr')(None, None, None)
def fill_not_null_values(elements):
    ordered_dict = OrderedDict()
    for element in elements:
        ordered_dict[element.timestamp] = element
    bid = None
    ask = None
    price = None
    quantity = None
    for (key, element) in ordered_dict.items():
        if element.bid is not None:
            bid = element.bid
        if element.ask is not None:
            ask = element.ask
        if element.price is not None:
            price = element.price
        if element.quantity is not None:
            quantity = element.quantity
        row = T.Row(id=element.id, timestamp=element.timestamp,
                    bid=bid, ask=ask, price=price, quantity=quantity)
        ordered_dict[key] = row
    return ordered_dict.values()
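A usage sketch, under the assumption that the rows represent ticks in a ticks_df DataFrame keyed by an 'id' column; the name ticks_df is illustrative, not from the source.

# Hypothetical forward-fill per instrument id (assumed name: ticks_df)
filled_df = (ticks_df.rdd
             .groupBy(lambda r: r.id)
             # sort each group by timestamp before filling forward
             .flatMap(lambda kv: fill_not_null_values(
                 sorted(kv[1], key=lambda r: r.timestamp)))
             .toDF())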
def take_log_in_all_columns(row: types.Row):
    old_row = row.asDict()
    new_row = {
        f'log({column_name})': math.log(value)
        for column_name, value in old_row.items()
    }
    return types.Row(**new_row)
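A brief usage sketch; it assumes a numbers_df DataFrame (illustrative name) whose columns are all strictly positive numerics, since math.log is applied to every value.

# Hypothetical row-wise transform over an all-numeric DataFrame
logs_df = numbers_df.rdd.map(take_log_in_all_columns).toDF()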
def extract_article(row):
    """Extract the content of the article and normalise the titles."""
    # redirect = row.page_redirect_title if row.page_redirect_title is not None else ""
    return T.Row(pid=row.page_id,
                 title=normalise_title(row.page_title),
                 title_rd=normalise_title(row.page_redirect_title),
                 wikitext=row.revision_text)
def nulls(row: T.Row) -> T.Row:
    d = row.asDict()
    _cnt = 0
    for _var in d.keys():
        if d[_var] is None:
            _cnt += 1
    d['nullcnt'] = _cnt
    return T.Row(**d)
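A usage sketch, assuming an input DataFrame named raw_df (illustrative): it appends a per-row null count and keeps only complete rows.

# Hypothetical null-count filtering (assumed name: raw_df)
with_nullcnt = raw_df.rdd.map(nulls).toDF()
complete_rows = with_nullcnt.where('nullcnt = 0')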
def test_row(self):
    first = T.Row(one=1, two=2, three=3, four=4)
    second = T.Row(three=3, two=2, four=4, one=1)
    # Spark currently sorts the fields of each row internally
    # so these will match...
    SparklyTest().assertRowsEqual(first, second)
    self.assertEqual(first, second)

    # but since Rows extend tuples, only the values are checked as
    # long as the fields define the same alpha order
    first = T.Row(one=1, two=2, three=3, four=4)
    second = T.Row(th=3, tw=2, f=4, o=1)
    # We fix this in our version by default
    with self.assertRaises(AssertionError):
        SparklyTest().assertRowsEqual(first, second)
    self.assertEqual(first, second)
def get_valid_ngrams(row):
    text = row.chunk
    found_anchors = []
    for n in range(10, 0, -1):
        ngrams = get_ngrams(text, n)
        for ng in ngrams:
            if ng in anchors_keys:
                found_anchors.append(ng)
                # text.replace(ng, " @ ")
    return [T.Row(pid=row.pid, anchor=a) for a in found_anchors]
def _(orig_row):
    orig_rows = orig_row.rows
    new_rows = [list(row) for row in orig_rows]
    for column, (datatype, fn) in columns.items():
        fn_rows = fn(orig_rows)
        for i, orig_row in enumerate(orig_rows):
            new_rows[i].append(fn_rows[orig_row])
    NewRow = pyspark_types.Row(*schema_names)
    return [NewRow(*row) for row in new_rows]
def test_i_can_fly(self):
    input = [pst.Row(a=1, b=2)]
    input_df = self.spark.createDataFrame(input)
    expect = [{'a': 1}]
    actual_df = input_df.select("a")
    actual = as_list(actual_df)
    self.assertEqual(actual, expect)
def create_spark_dataframe_from_list(label_list):
    # Create image categorical labels (integer IDs)
    local_rows = []
    for label in label_list:
        _row_struct = {"label": label}
        row = sptyp.Row(**_row_struct)
        local_rows.append(row)
    dataframe = sqlContext.createDataFrame(local_rows)
    return dataframe
def encode_authors(actors_str):
    actors = [a.strip().lower() for a in actors_str.split(",")]
    ids = []
    for a in actors:
        ids.append(actors_id_dict[a])
    ids = sorted(ids) + (4 - len(ids)) * [None]
    return t.Row("actor_id_0", "actor_id_1", "actor_id_2", "actor_id_3")(*ids)
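A sketch of one way to call this, assuming a movies_df DataFrame with an 'actors' string column, integer ids in actors_id_dict, and pyspark.sql.functions imported as F; all of these names are assumptions, not from the source.

# Hypothetical struct-returning UDF around encode_authors
from pyspark.sql import functions as F

actor_schema = t.StructType([
    t.StructField(f'actor_id_{i}', t.IntegerType(), True) for i in range(4)])
encode_authors_udf = F.udf(encode_authors, actor_schema)
movies_with_ids = (movies_df
                   .withColumn('actor_ids', encode_authors_udf('actors'))
                   .select('*', 'actor_ids.*')
                   .drop('actor_ids'))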
def generate_vector_df(spark, glove, vocab_df):
    vector_schema = T.StructType([
        T.StructField('id', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])
    vector_df = spark.sparkContext \
        .parallelize([(i, [float(d) for d in glove.word_vectors[i]])
                      for i in range(len(glove.word_vectors))]) \
        .map(lambda t: T.Row(id=t[0], vector=t[1])).toDF(vector_schema)
    return vector_df
def get_sentiment(tweet="Default"): """ udf to return the sentiment of the tweet. :return: -1 0 1 for Negative, Neutral and Positive sentiment. """ tweet_json = tweet return t.Row('id', 'full_text', 'len', 'in_reply_to_status_id', 'date', 'source', 'likes', 'retweet', 'sent_by', 'friend_of', 'hash_tag') \ (tweet_json['id'], tweet_json['full_text'], tweet_json['len'], tweet_json['in_reply_to_status_id'], tweet_json['date'], tweet_json['source'], tweet_json['likes'], tweet_json['retweet'], tweet_json['sent_by'], tweet_json['friend_of'], tweet_json['hash_tag'])
def get_links(page):
    links = []
    for m in links_regex.findall(page.wikitext):
        link = normalise_title(m[0])
        anchor = m[1] if len(m) > 1 and len(m[1]) > 0 else link
        if len(link) > 0:
            links.append(
                T.Row(pid=page.pid,
                      title=page.title,
                      link=link,
                      anchor=normalise_anchor(anchor)))
    return links
def mse(row: T.Row) -> T.Row:
    d = row.asDict()
    _mse = 0.0
    if d['Sales_Pred'] is None:
        print("'Sales_Pred'=None")
        _mse = 0
    elif d['sales'] is None:
        _mse = d['Sales_Pred'] ** 2
    else:
        _mse = (d['Sales_Pred'] - d['sales']) ** 2
    d['mse'] = _mse
    return T.Row(**d)
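A short usage sketch, assuming a pred_df DataFrame (illustrative name) that carries both the 'Sales_Pred' and 'sales' columns: the per-row squared error is appended, then averaged into an RMSE.

# Hypothetical RMSE computation from the per-row squared errors
import math

scored = pred_df.rdd.map(mse).toDF()
rmse = math.sqrt(scored.agg({'mse': 'avg'}).collect()[0][0])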
def tag_ori(rg, oi, user_orient):
    """Parse the user-profile tags."""
    tag_set = set()
    try:
        province = "0"
        city = "0"
        if rg is not None and "_" in rg:
            province, city = rg.split("_", 1)
        if oi is not None:
            tag_set.add("19_%s" % oi)
        if user_orient not in [None, '']:
            for tag in user_orient.split(","):
                if "_" in tag:
                    prefix = tag.split("_", 1)[0]
                    if prefix in ["12", "1", "13", "14", "15", "16", "17", "8"]:
                        tag_set.add(tag)
        return T.Row('province', 'city', 'tags')(
            province, city,
            ",".join(sorted(tag_set,
                            key=lambda x: [int(i) for i in x.split("_")])))
    except Exception:
        return T.Row('province', 'city', 'tags')("", "", "")
def CreateTrainImageUriandLabels(image_uris, label, label_name, cardinality, isDefault):
    # Create image categorical labels (integer IDs)
    local_rows = []
    for uri in image_uris:
        label_inds = np.zeros(cardinality)
        label_inds[label] = 1.0
        one_hot_vec = spla.Vectors.dense(label_inds.tolist())
        _row_struct = {"uri": uri,
                       "one_hot_label": one_hot_vec,
                       "label": int(label),
                       "label_name": str(label_name),
                       "isDefault": int(isDefault)}
        row = sptyp.Row(**_row_struct)
        local_rows.append(row)
    image_uri_df = sqlContext.createDataFrame(local_rows)
    return image_uri_df
def omdb_data(arguments):
    movie_name, year = arguments
    client = OMDBClient(apikey=OMDB_API_KEY)
    try:
        result = client.get(title=movie_name, year=year, fullplot=True, tomatoes=True)
    except HTTPError as e:
        print(e)
        client.set_default("apikey", OMDB_API_KEY_fallback)
        result = client.get(title=movie_name, year=year, fullplot=True, tomatoes=True)

    result_to_keep = {}
    for key in requested_flat_fields:
        result_to_keep[key] = result.get(key, None)

    for nested_field in requested_nested_fields:
        requested_nested_list = requested_nested_fields[nested_field]
        nested_list = result.get(nested_field, None)
        if nested_list:
            for nested_dict in nested_list:
                source = nested_dict.get("source", None)
                if source:
                    value = nested_dict.get("value", None)
                    if source in requested_nested_list:
                        source_formatted = to_snake_case(source)
                        key = f"{nested_field}_{source_formatted}"
                        result_to_keep[key] = value
            requested_sources = requested_nested_fields[nested_field]
            for requested_source in requested_sources:
                source_formatted = to_snake_case(requested_source)
                key = f"{nested_field}_{source_formatted}"
                if key not in result_to_keep:
                    result_to_keep[key] = None
        else:
            requested_sources = requested_nested_fields[nested_field]
            for requested_source in requested_sources:
                source_formatted = to_snake_case(requested_source)
                key = f"{nested_field}_{source_formatted}"
                result_to_keep[key] = None

    return t.Row(*list(result_to_keep.keys()))(*list(result_to_keep.values()))
def get_plain_text_without_links(row):
    """Replace the links with a dot to interrupt the sentence and get the plain text."""
    wikicode = row.wikitext
    wikicode_without_links = re.sub(links_regex, '.', wikicode)
    wikicode_without_links = re.sub(references_regex, '.', wikicode_without_links)
    ## we don't have mwparserfromhell on the spark-cluster yet
    try:
        text = mwparserfromhell.parse(wikicode_without_links).strip_code()
    except:
        text = wikicode_without_links
    text = wikicode_without_links
    return T.Row(pid=row.pid,
                 title=normalise_title(row.title),
                 text=text.lower())
def stats_from_id(video_id):
    if not video_id:
        return None, None, None
    youtube = youtube_utils.get_authenticated_service(
        api_service_name, api_version, scopes, n_tries=0
    )
    n_tries = 0
    success = False
    while not success and n_tries < 19:
        try:
            request = youtube.videos().list(part="statistics", id=video_id)
            response = request.execute()
            success = True
        except HttpError as e:
            n_tries += 1
            youtube = youtube_utils.get_authenticated_service(
                api_service_name, api_version, scopes, n_tries=n_tries,
            )
    if not success:
        return None, None, None
    try:
        stats = response["items"][0]["statistics"]
        view_count = int(stats["viewCount"])
        like_count = int(stats["likeCount"])
        dislike_count = int(stats["dislikeCount"])
        engagement_score = (like_count + dislike_count) / view_count
        positive_engagement_score = like_count / dislike_count
    except KeyError as e:
        return None, None, None
    return t.Row(
        "youtube_view_count",
        "youtube_engagement_score",
        "youtube_positive_engagement_score",
    )(view_count, engagement_score, positive_engagement_score)
def compute_tweet_sentiment(msg):
    parameters = {'tweet': msg}
    r = requests.get(url=SENTIMENT_SERVER_URL, params=parameters)
    sentiment = 1
    psentiment = 0
    ngsentiment = 0
    nsentiment = 0
    nltk_sentiment = 1
    nltk_psentiment = 0
    nltk_ngsentiment = 0
    nltk_nsentiment = 0
    if r.status_code == 200:
        data = r.json()
        sentiment = data['Sentiment']
        nltk_sentiment = data['Sentiment_nltk']
        if sentiment == 0:
            ngsentiment = 1
        elif sentiment == 1:
            nsentiment = 1
        elif sentiment == 2:
            psentiment = 1
        if nltk_sentiment == 0:
            nltk_ngsentiment = 1
        elif nltk_sentiment == 1:
            nltk_nsentiment = 1
        elif nltk_sentiment == 2:
            nltk_psentiment = 1
        print(data)
    return t.Row('sentiment', 'psentiment', 'ngsentiment', 'nsentiment',
                 'nltk_sentiment', 'nltk_psentiment', 'nltk_ngsentiment',
                 'nltk_nsentiment')(sentiment, psentiment, ngsentiment,
                                    nsentiment, nltk_sentiment, nltk_psentiment,
                                    nltk_ngsentiment, nltk_nsentiment)
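A possible wiring as a struct-returning UDF, under these assumptions (none from the source): a tweets_df DataFrame with a 'text' column, integer values coming back from the sentiment server, and pyspark.sql.functions imported as F. Note that every row issues one HTTP request, so this only suits small batches.

# Hypothetical struct-returning UDF around compute_tweet_sentiment
from pyspark.sql import functions as F

sentiment_fields = ['sentiment', 'psentiment', 'ngsentiment', 'nsentiment',
                    'nltk_sentiment', 'nltk_psentiment', 'nltk_ngsentiment',
                    'nltk_nsentiment']
sentiment_schema = t.StructType([
    t.StructField(name, t.IntegerType(), True) for name in sentiment_fields])
sentiment_udf = F.udf(compute_tweet_sentiment, sentiment_schema)
scored_tweets = (tweets_df
                 .withColumn('sent', sentiment_udf('text'))
                 .select('*', 'sent.*')
                 .drop('sent'))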
    ## main namespace
    .where(F.col('page_namespace') == 0)
    ## no redirect-pages
    # .where(F.col('page_redirect_title')=='')
    .where(F.col('revision_text').isNotNull()
    ).where(F.length(F.col('revision_text')) > 0))

## extracting pid, title, title_rd, and the wikitext
## titles are normalized
wikipedia = spark.createDataFrame(
    wikipedia_all.rdd.map(extract_article).filter(lambda r: r is not None))

## only redirects
redirects = spark.createDataFrame(
    wikipedia.where(F.col('title_rd') != '').rdd.map(
        lambda r: T.Row(title_from=r.title, title_to=r.title_rd))).distinct()

## only articles (no redirect title)
articles = (wikipedia.where(F.col('title_rd') == '').select(
    'pid', 'title', 'wikitext'))

## extract the links
links = spark.createDataFrame(articles.rdd.flatMap(get_links))

links_resolved = (
    links.join(
        redirects, links['link'] == redirects['title_from'],
        how='leftouter').select(
            'pid', 'title', 'anchor',
def get_chunks(row):
    return [
        T.Row(pid=row.pid, chunk=blocks.strip())
        for blocks in re.split('[\n\.,;:()!"]', row.text)
        if len(blocks.strip()) > 0
    ]
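A sketch of how the text-chunking helpers might chain together, assuming the articles DataFrame from the pipeline above and an anchors_keys lookup already broadcast to the executors; the intermediate DataFrame names are illustrative.

# Hypothetical chaining: plain text -> chunks -> candidate anchors
plain_text_df = spark.createDataFrame(
    articles.rdd.map(get_plain_text_without_links))
chunks_df = spark.createDataFrame(plain_text_df.rdd.flatMap(get_chunks))
ngram_candidates = spark.createDataFrame(chunks_df.rdd.flatMap(get_valid_ngrams))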