def groupby_range(df, colgroupby, colrange, show_progress=False):
    """
    Group the DataFrame and find the range between the smallest and largest value for each group.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colrange: str
        The column to find the range of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the range. If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colrange+'Range'
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colrange), '0'], [str(colrange) + 'Range'] * 2)
    return df.groupby(colgroupby, sort=False)[colrange].progress_apply(
        lambda x: x.max() - x.min()).to_frame().reset_index().rename(
            columns=newname_dict)
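# Usage sketch (illustrative only): assumes `groupby_range`, `zip2dict` and `tqdm` are
# importable as in the function above; the toy column names and data are invented.
import pandas as pd

pubs = pd.DataFrame({'AuthorId': [1, 1, 1, 2, 2],
                     'Year': [2001, 2005, 2010, 1999, 2003]})
career_span = groupby_range(pubs, colgroupby='AuthorId', colrange='Year', show_progress=True)
# -> two columns: 'AuthorId' and 'YearRange' (max Year minus min Year per author)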
def compute_hindex(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the h index for each group in the DataFrame. See :cite:`hirsch2005index` for the definition.

    The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_hindex`.

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Author.

    :param colgroupby : str
        The DataFrame column with Author Ids.

    :param colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: colgroupby, 'Hindex'
    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Hindex', disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby) + 'Hindex'] * 2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        author_hindex).to_frame().reset_index().rename(columns=newname_dict)
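# Usage sketch (illustrative only): assumes `compute_hindex` and its helper `author_hindex`
# are importable from this module; the toy citation counts are invented.
import pandas as pd

cites = pd.DataFrame({'AuthorId': [1, 1, 1, 2, 2],
                      'Ctotal': [10, 3, 1, 5, 5]})
hindex = compute_hindex(cites, colgroupby='AuthorId', colcountby='Ctotal')
# -> columns 'AuthorId' and 'AuthorIdHindex'; both toy authors have an h-index of 2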
def groupby_mean(df, colgroupby, colcountby, show_progress=False):
    """
    Group the DataFrame and find the mean of the column.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colcountby: str
        The column to find the mean of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the summation. If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colcountby+'Mean'
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'], [str(colcountby) + 'Mean'] * 2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        lambda x: x.mean()).to_frame().reset_index().rename(
            columns=newname_dict)
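# Usage sketch (illustrative only), mirroring groupby_range above; the toy data is invented.
import pandas as pd

cites = pd.DataFrame({'AuthorId': [1, 1, 2], 'Ctotal': [10, 2, 4]})
mean_cites = groupby_mean(cites, colgroupby='AuthorId', colcountby='Ctotal')
# -> columns 'AuthorId' and 'CtotalMean' (6.0 for author 1, 4.0 for author 2)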
def compute_sleepingbeauty(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the sleeping beauty and awakening time for each group in the DataFrame. See :cite:`ke2015beauty` for details.

    The algorithmic implementation for each publication can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each Author.

    colgroupby : str
        The DataFrame column with Author Ids.

    colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: colgroupby, 'Beauty' and 'Awakening'
    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Beauty', disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        beauty_coefficient).to_frame().reset_index().rename(
            columns=newname_dict)
def groupby_zero_col(df, colgroupby, colrange, show_progress=False):
    """
    Group the DataFrame and shift the column so the minimum value is 0.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colrange: str
        The column to find the range of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar. If str, the name of the progress bar to display.

    Returns
    ----------
    Series
        The colrange column shifted so that the minimum value within each group is 0.
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    return df.groupby(
        colgroupby, sort=False)[colrange].progress_transform(lambda x: x - x.min())
def __init__(self, embedding_cache, max_length=512, padding_side="right", pad_value=0, trunc_side="random"):
    assert Path(embedding_cache).exists(), "embedding cache file doesn't exist, need to run preprocessing.py"
    with open(embedding_cache, "rb") as pickle_in:
        print("loading the cached embedding data from pickle...")
        cache_df = pickle.load(pickle_in)

    # Adjust this part if you want to try other embeddings
    self.review_df = cache_df.loc[cache_df["contact_embed"] != '[]']

    partial_pad_trunc = partial(pad_trunc_sequences, max_length=max_length,
                                padding_side=padding_side, pad_value=pad_value, trunc_side=trunc_side)

    tqdm.pandas(desc="Padding and truncating hmd and head customer embedding...")
    review_input_df = self.review_df["contact_embed"].progress_apply(lambda x: partial_pad_trunc(x))
    # np.long and np.bool were removed in recent NumPy releases; use np.int64 and bool instead
    self.review_input_ids = np.array([item[0] for item in review_input_df.values], dtype=np.int64)
    self.review_attention_mask = np.array([item[1] for item in review_input_df.values], dtype=bool)
    self.review_token_type_ids = np.array([item[2] for item in review_input_df.values], dtype=np.int64)

    tqdm.pandas(desc="Padding and truncating agent embedding...")
    # change this part if you want to use another embedding
    # self.review_df['asic_sic_embed'] = self.review_df['asic_embed'] + self.review_df['sic_embed']
    agent_input_df = self.review_df["agent_embed"].progress_apply(lambda x: partial_pad_trunc(x))
    self.agent_input_ids = np.array([item[0] for item in agent_input_df.values], dtype=np.int64)
    self.agent_attention_mask = np.array([item[1] for item in agent_input_df.values], dtype=bool)
    self.agent_token_type_ids = np.array([item[2] for item in agent_input_df.values], dtype=np.int64)

    if "anecdote_lead_final" in self.review_df.columns:
        # convert the tag list to the multi-label classification format;
        # this label set and order is used to encode the labels (a warning is printed because '' is not encoded)
        self.binarized_label = self.review_df["anecdote_lead_final"].astype(float).values
    print("finished!")
def author_top_field(pub2author_df, colgroupby='AuthorId', colcountby='FieldId',
                     fractional_field_counts=False, show_progress=False):
    """
    Calculate the most frequent field in the authors career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.  If None then the database 'AuthorId' is used.

    colcountby : str, default 'FieldId'
        The DataFrame column with Citation counts for each publication.  If None then the database 'FieldId' is used.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'AuthorId', 'TopFieldId'
    """

    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable=not show_progress)

    if not fractional_field_counts:
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication
        pub2nfields['PublicationWeight'] = 1.0 / pub2nfields[str(colcountby) + 'Count']
        del pub2nfields[str(colcountby) + 'Count']

        # merge counts
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # custom weighted mode
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    newname_dict = zip2dict([str(colcountby), '0'], ['Top' + str(colcountby)] * 2)
    return author2field.to_frame().reset_index().rename(columns=newname_dict)
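# Usage sketch (illustrative only): assumes `author_top_field` and the helpers it relies on
# (`check4columns`, `groupby_count`, `zip2dict`) are importable from this module; the toy data is invented.
import pandas as pd

pub2author = pd.DataFrame({'AuthorId': [1, 1, 1, 2],
                           'PublicationId': [10, 11, 11, 12],
                           'FieldId': [100, 100, 200, 300]})
top_fields = author_top_field(pub2author)
# -> columns 'AuthorId' and 'TopFieldId' (field 100 for author 1, field 300 for author 2)
top_fields_frac = author_top_field(pub2author, fractional_field_counts=True)
# same idea, but each publication contributes 1/#fields to each of its fields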
def compute_disruption_index(pub2ref, show_progress=False):
    """
    Funk, Owen-Smith (2017) A Dynamic Network Measure of Technological Change
    *Management Science* **63**(3), 791-817

    Wu, Wang, Evans (2019) Large teams develop and small teams disrupt science and technology
    *Nature* **566**, 378–382
    """
    if show_progress:
        print("Starting computation of disruption index.")

    reference_groups = pub2ref.groupby('CitingPublicationId', sort=False)['CitedPublicationId']
    citation_groups = pub2ref.groupby('CitedPublicationId', sort=False)['CitingPublicationId']

    def get_citation_groups(pid):
        try:
            return citation_groups.get_group(pid).values
        except KeyError:
            return []

    def disruption_index(citing_focus):
        focusid = citing_focus.name

        # if the focus publication has no references, then it has a disruption of None
        try:
            focusref = reference_groups.get_group(focusid)
        except KeyError:
            return None

        # implementation 1: keep it numpy
        #cite2ref = reduce(np.union1d, [get_citation_groups(refid) for refid in focusref])
        #nj = np.intersect1d(cite2ref, citing_focus.values).shape[0]
        #nk = cite2ref.shape[0] - nj

        # implementation 2: but dicts are faster...
        cite2ref = {citeid: 1 for refid in focusref for citeid in get_citation_groups(refid)}
        nj = sum(cite2ref.get(pid, 0) for pid in citing_focus.values)
        nk = len(cite2ref) - nj

        ni = citing_focus.shape[0] - nj

        return (ni - nj) / (ni + nj + nk)

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Disruption Index', disable=not show_progress)

    newname_dict = {'CitingPublicationId': 'DisruptionIndex',
                    'CitedPublicationId': 'PublicationId'}
    return citation_groups.progress_apply(
        disruption_index).to_frame().reset_index().rename(columns=newname_dict)
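# Usage sketch (illustrative only): a tiny citation table, invented for the example.
# Each row is one citation: CitingPublicationId -> CitedPublicationId.
import pandas as pd

pub2ref = pd.DataFrame({'CitingPublicationId': [2, 3, 4, 4, 5],
                        'CitedPublicationId':  [1, 1, 1, 2, 2]})
di = compute_disruption_index(pub2ref, show_progress=True)
# -> columns 'PublicationId' and 'DisruptionIndex'; publications with no outgoing
#    references in the table (here publication 1) get None.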
def cogroupby(df, N):
    adj_mat = spsparse.dok_matrix((N, N), dtype=int)

    def inducedcombos(authorlist):
        if authorlist.shape[0] >= 2:
            for i, j in combinations(authorlist, 2):
                adj_mat[i, j] += 1

    tqdm.pandas(desc='CoAuthorship')
    df.groupby('PublicationId')['AuthorId'].progress_apply(inducedcombos)

    adj_mat = adj_mat + adj_mat.T
    return adj_mat
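# Usage sketch (illustrative only): assumes `scipy.sparse` is imported as `spsparse` and
# `combinations` comes from itertools, as the function above requires. AuthorId values
# must already be integer indices in [0, N); the toy data is invented.
import pandas as pd

paa = pd.DataFrame({'PublicationId': [1, 1, 1, 2, 2],
                    'AuthorId':      [0, 1, 2, 1, 2]})
adj = cogroupby(paa, N=3)
print(adj.toarray())  # symmetric co-authorship counts; entry (1, 2) is 2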
def groupby_count(df, colgroupby, colcountby, count_unique=True, show_progress=False):
    """
    Group the DataFrame and count the number for each group.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colcountby: str
        The column to count.

    :param count_unique: bool, default True
        If True, count unique items in the rows. If False, just return the number of rows.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the count. If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colcountby+'Count'
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'], [str(colcountby) + 'Count'] * 2)
    if count_unique:
        count_df = df.groupby(
            colgroupby, sort=False)[colcountby].progress_apply(lambda x: x.nunique())
    else:
        count_df = df.groupby(
            colgroupby, sort=False)[colcountby].progress_apply(lambda x: x.shape[0])

    return count_df.to_frame().reset_index().rename(columns=newname_dict)
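# Usage sketch (illustrative only); the toy data is invented.
import pandas as pd

pub2author = pd.DataFrame({'AuthorId': [1, 1, 2], 'PublicationId': [10, 11, 10]})
prod = groupby_count(pub2author, colgroupby='AuthorId', colcountby='PublicationId')
# -> columns 'AuthorId' and 'PublicationIdCount' (2 publications for author 1, 1 for author 2)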
def qfactor(show_progress=False):
    """
    This function calculates the Q-factor for an author.  See [q] for details.

    References
    ----------
    .. [q] Sinatra (2016): "title", *Science*. DOI: xxx
    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Q-factor', disable=not show_progress)

    # TODO: implement
    return False
def compute_citation_rank(df, colgroupby='Year', colrankby='C10', ascending=True, normed=False, show_progress=False):
    """
    Rank elements in the array from 0 (smallest) to N-1 (largest)

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param colgroupby : str, list
        The DataFrame column(s) to subset by.

    :param colrankby : str
        The DataFrame column to rank by.

    :param ascending : bool, default True
        Sort ascending vs. descending.

    :param normed : bool, default False
        - False : rank is from 0 to N-1
        - True : rank is from 0 to 1

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        The original dataframe with a new column for rank: colrankby+"Rank"
    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Citation Rank', disable=not show_progress)

    df[str(colrankby) + "Rank"] = df.groupby(colgroupby)[colrankby].progress_transform(
        lambda x: rank_array(x, ascending, normed))
    return df
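# Usage sketch (illustrative only): assumes `rank_array` is importable from this module;
# the toy data is invented.
import pandas as pd

pubs = pd.DataFrame({'Year': [2000, 2000, 2001], 'C10': [5, 1, 7]})
ranked = compute_citation_rank(pubs, colgroupby='Year', colrankby='C10')
# adds a 'C10Rank' column with the within-year rank of each publication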
def load_dataset(lang_path, tokenizer, max_length, balanced=False, dataset_name="test", limit=None):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    tqdm.pandas(leave=False)

    # Read data
    df = pd.read_csv(lang_path + "/{}.csv".format(dataset_name.split("_")[0]), header=None)
    df.columns = ["sentiment", "review"]
    df["sentiment"] = pd.to_numeric(df["sentiment"])  # Sometimes label gets read as string

    # Remove excessively long examples
    lengths = df["review"].progress_apply(lambda x: len(tokenizer.encode(x)))
    df = df[lengths <= max_length].reset_index(drop=True)  # Remove long examples

    # Balance classes
    if dataset_name == "train" and balanced:
        positive_examples = df["sentiment"].sum()
        if not limit:
            # Find which class is the minority and set its size as limit
            n = min(positive_examples, df.shape[0] - positive_examples)
        else:
            n = limit
        ones_idx = np.random.choice(np.where(df["sentiment"])[0], size=n)
        zeros_idx = np.random.choice(np.where(df["sentiment"] == 0)[0], size=n)
        df = df.loc[list(ones_idx) + list(zeros_idx)].reset_index(drop=True)
    elif not balanced and limit:
        raise Exception("Must set 'balanced' to True to choose a manual limit.")

    # Convert to TF dataset
    dataset = bert_convert_examples_to_tf_dataset(
        [(Example(text=text, category_index=label)) for label, text in df.values],
        tokenizer,
        max_length=max_length)

    return df, dataset
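# Usage sketch (illustrative only): assumes a directory such as "data/en" containing a
# "test.csv" with (sentiment, review) rows, and that `bert_convert_examples_to_tf_dataset`
# and `Example` from this codebase are importable. The path and model name are placeholders.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
df, test_ds = load_dataset("data/en", tokenizer, max_length=128, dataset_name="test")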
def publication_beauty(pub2ref_df, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', show_progress=False):
    """
    Calculate the sleeping beauty and awakening time for each cited publication.  See :cite:`ke2015beauty` for details.

    The algorithmic implementation can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    pub2ref_df : DataFrame, default None, Optional
        A DataFrame with the temporal citing information.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with Cited Publication Ids.  If None then the database 'CitedPublicationId' is used.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with Citation counts for each publication.  If None then the database 'CitingPublicationId' is used.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: colgroupby, 'Beauty' and 'Awakening'
    """
    check4columns(pub2ref_df, ['CitedPublicationId', 'CitingPublicationId', 'CitingYear'])

    tqdm.pandas(desc='Beauty', disable=not show_progress)

    df = groupby_count(pub2ref_df, colgroupby=['CitedPublicationId', 'CitingYear'],
                       colcountby='CitingPublicationId', count_unique=True)

    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return df.groupby(colgroupby)[colcountby + 'Count'].progress_transform(
        beauty_coefficient).rename(columns=newname_dict)
# TODO: read documentation
!pip install --quiet tqdm==4.59.0

from tqdm.notebook import tqdm

import pandas as pd
from matplotlib import rcParams

import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

rcParams['figure.figsize'] = 13, 7
tqdm.pandas()
pl.seed_everything(7)

df = pd.read_csv('/content/drive/MyDrive/allwind.csv')
df = df.drop(['ISTANBUL WindSpeed(m/s)'], axis=1)
df = df.drop(['Unnamed: 0'], axis=1)
df['Date'] = pd.date_range(start='20180201', freq='H', periods=len(df))
df.dropna(inplace=True)
df.head()

# preprocessing:
rowsData = []
for i, row in tqdm(df.iterrows(), total=len(df)):
def coauthorship_network(paa_df, focus_author_ids=None, focus_constraint='authors', show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the coauthorship-network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the
                co-authorships between authors in the set.
            `publications` : the publication history of `focus_author_ids' defines the edge set,
                giving the co-authorships where at least one author from `focus_author_ids' was involved.
            `ego` : the `focus_author_ids' defines a seed set, such that all authors must have
                co-authored at least one publication with an author from `focus_author_ids', but
                co-authorships are also found between the second-order author sets.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-authorship network

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.
    """
    required_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values, focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values, focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values, focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values, focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

    # map authors to the row/column of the adj mat
    author2int = {aid: i for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))}
    Nauthors = paa_df['AuthorId'].nunique()

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        if author_list.shape[0] >= 2:
            for ia, ja in combinations(author_list, 2):
                adj_mat[author2int[ia], author2int[ja]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations', leave=True, disable=not show_progress)

    # go through all publications and apply the coauthorship edge generator
    paa_df.groupby('PublicationId')['AuthorId'].progress_apply(coauthor_cluster)

    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
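# Usage sketch (illustrative only): assumes this module's helpers (`check4columns`, `isin_sorted`)
# and `scipy.sparse` (imported as `sparse`) are available as above; the toy data is invented.
import pandas as pd

paa = pd.DataFrame({'PublicationId': [10, 10, 11, 11],
                    'AuthorId':      ['a1', 'a2', 'a2', 'a3']})
adj, author2int = coauthorship_network(paa, show_progress=True)
# adj[author2int['a1'], author2int['a2']] == 1 and adj[author2int['a2'], author2int['a3']] == 1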
def deflate(nominal_values, nominal_dates, real_date, index='ipca', progress_bar=False, on_jupyter=False):
    """
    deflatebr uses data from the Brazilian Institute for Applied Economic
    Research's API (IPEADATA) to adjust nominal Brazilian Reais for inflation.

    Parameters
    ----------
    nominal_values : [int, float, np.array or pd.Series]
        An array containing nominal Brazilian Reais to deflate.
    nominal_dates : [str, date or list]
        A date vector with corresponding nominal dates (i.e., when nominal values were measured).
        Values are set to the previous month, following the standard methodology
        used by the Brazilian Central Bank
        https://www3.bcb.gov.br/CALCIDADAO/publico/metodologiaCorrigirIndice.do?method=metodologiaCorrigirIndice
    real_date : str
        A value indicating the reference date to deflate nominal values in the
        format 'YYYY-MM' (e.g., '2018-01' for January 2018).
    index : str
        Indicates the price index used to deflate nominal Reais. Valid options are:
        'ipca', 'igpm', 'igpdi', 'ipc', and 'inpc'.
    progress_bar : bool
        True to display a progress bar. False by default.
    on_jupyter : bool
        True to display an HTML progress bar on jupyter notebook or jupyter lab.

    Returns
    -------
    np.ndarray : an array of deflated values.
    """
    # Prepare inputs
    nominal_values = np.array(nominal_values)
    real_date = clean_real_date(real_date)

    # If it is just one value, turn into a list
    if isinstance(nominal_dates, str):
        nominal_dates = [pd.to_datetime(nominal_dates)]
    elif isinstance(nominal_dates, date):
        nominal_dates = [nominal_dates]

    if len(nominal_dates) > 1:
        nominal_dates = pd.to_datetime(nominal_dates)

    # Round dates to first of each month and get one month earlier
    nominal_dates = [round_date_to_month(dt) for dt in nominal_dates]

    # Test index input
    if index not in ['ipca', 'igpm', 'igpdi', 'ipc', 'inpc']:
        raise Exception("index must be one of 'ipca', 'igpm', 'igpdi', 'ipc', 'inpc'")

    # Request to IPEA API
    if index == 'ipca':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='PRECOS12_IPCA12')"
    elif index == 'igpm':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IGPM12')"
    elif index == 'igpdi':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IGPDI12')"
    elif index == 'ipc':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IPC12')"
    elif index == 'inpc':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='PRECOS12_INPC12')"

    res = requests.get(q)
    indice = pd.DataFrame.from_dict(json.load(StringIO(res.text))['value'])
    indice['VALDATA'] = pd.to_datetime(indice['VALDATA'], utc=True).dt.date.astype(str)

    # Calculate changes in values
    real_indx = indice.loc[indice.VALDATA == real_date, 'VALVALOR'].values

    df = pd.DataFrame({'nom_values': nominal_values, 'VALDATA': nominal_dates})
    df = df.merge(indice[['VALDATA', 'VALVALOR']], how='left', on='VALDATA')

    if progress_bar:
        if on_jupyter:
            from tqdm.notebook import tqdm
            tqdm.pandas()
            df['deflated'] = df[['nom_values', 'VALVALOR']].progress_apply(
                lambda x: ((real_indx / x[1]) * x[0])[0], axis=1)
        else:
            from tqdm import tqdm
            tqdm.pandas()
            df['deflated'] = df[['nom_values', 'VALVALOR']].progress_apply(
                lambda x: ((real_indx / x[1]) * x[0])[0], axis=1)
    else:
        df['deflated'] = df[['nom_values', 'VALVALOR']].apply(
            lambda x: ((real_indx / x[1]) * x[0])[0], axis=1)

    return df.deflated.values
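# Usage sketch (illustrative only): requires network access to the IPEADATA API and the
# helpers `clean_real_date` and `round_date_to_month` used by the function above;
# the amounts and dates are invented.
deflated = deflate(nominal_values=[100.0, 200.0],
                   nominal_dates=['2015-01-01', '2016-06-01'],
                   real_date='2018-01',
                   index='ipca')
# -> NumPy array with the two amounts expressed in January 2018 Reais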
nltk.download("stopwords") import pandas as pd import numpy as np import spacy import statistics from fuzzywuzzy import fuzz from sklearn.feature_extraction.text import TfidfVectorizer import nltk from nltk.corpus import stopwords from nltk.stem import PorterStemmer import re from bs4 import BeautifulSoup from tqdm.notebook import trange, tqdm tqdm.pandas(desc="Progress") # Now you can use `progress_apply` instead of `apply` # and `progress_map` instead of `map` import numpy as np import matplotlib.pyplot as plt import pickle import warnings warnings.filterwarnings("ignore") #Open Data in Pandas dataframe path = '/content/drive/My Drive/Case Studies/Quora Question Pairs/train.csv' #Loading data into pandas dataframe
def align_publications(df1, df2=None, columns2match_exact=['Year'], column2match_approx='Title',
                       ntop=1, cosine_lower_bound=0.75, use_threads=False, n_jobs=2,
                       lev_lower_bound=0.9, show_progress=False):
    """
    Fast way to match publications between two datasets.  We first match subsets of exact values
    between the two DataFrames, as specified by `columns2match_exact`.  We then use a fast
    approximate string matching to match values in `column2match_approx` to within a threshold.

    Parameters
    ----------
    :param df1 : DataFrame
        A DataFrame with the publication information.

    :param df2 : DataFrame, Optional
        Another DataFrame with the publication information.  If None, then df1 is used again.

    :param columns2match_exact : list, Default: ['Year']
        The columns to match exactly between DataFrames.

    :param column2match_approx : str, Default: 'Title'
        The column to match approximately between DataFrames.

    :param ntop : int, Default 1
        The number of best matches from df2 to return for each row of df1.

    :param cosine_lower_bound : float, Default 0.75
        The lower bound for cosine similarity when doing a fuzzy string match.

    :param use_threads : bool, Default False
        Use multithreading when calculating cosine similarity for fuzzy string matching.

    :param n_jobs : int, Optional, Default 2
        If use_threads is True, the number of threads to use in the parallel calculation.

    :param lev_lower_bound : float, Default 0.9
        The lower bound on the Levenshtein similarity for a fuzzy match to be accepted.

    :param show_progress : bool, Default False
        If True, show a progress bar tracking the calculation.
    """
    if df2 is None:
        df2 = df1

    # we can do an exact match from merge
    if (columns2match_exact is not None and len(columns2match_exact) > 0) and \
            (column2match_approx is None or len(column2match_approx) == 0):
        # get the index name and reset the index to force it as a column
        indexcol = df2.index.name
        df2 = df2.reset_index(drop=False)

        # now merge the dataframes and drop duplicates giving an exact match
        mdf = df1.merge(df2[columns2match_exact + [indexcol]], how='left', on=columns2match_exact)
        mdf.drop_duplicates(subset=columns2match_exact, keep='first', inplace=True)

        return mdf[indexcol]

    # otherwise, if there is a column to match approximately then we need to prep for fuzzy matching
    elif len(column2match_approx) > 0:
        # we take a two-step approach to fuzzy matching
        # 1) first we employ a super fast but not very accurate cosine-similarity
        #    matching to narrow down the possible pair-wise matches

        # for each string, we create feature vectors from 3-char counts
        tfidf = TfidfVectorizer(min_df=1, ngram_range=(3, 3), analyzer='char', lowercase=False)
        tfidf1 = tfidf.fit_transform(df1[column2match_approx])
        tfidf2 = tfidf.transform(df2[column2match_approx])

        matches = np.empty(tfidf1.shape[0])
        matches[:] = np.nan

        # if there are no columns to match exactly
        if (columns2match_exact is None or len(columns2match_exact) == 0):

            # 1) first do the all-to-all cosine similarity and extract up to the ntop best possible matches
            co = awesome_cossim_topn(tfidf1, tfidf2.T, ntop=ntop, lower_bound=cosine_lower_bound,
                                     use_threads=use_threads, n_jobs=n_jobs).tocoo()

            # 2) now use the Levenshtein distance to find the best match
            for row in tqdm(set(co.row), desc="Align Publications", disable=not show_progress):
                rowcol = co.col[co.row == row]
                argmatch, lev_dist = levenshtein_best_match(df1.loc[row, column2match_approx],
                                                            df2.iloc[rowcol][column2match_approx])
                if lev_dist >= lev_lower_bound:
                    matches[row] = rowcol[argmatch]

        else:
            df2groups = df2.groupby(columns2match_exact)

            def subgroup_match(subdf):
                if df2groups.indices.get(subdf.name, None) is not None:
                    sub_tfidf1 = tfidf1[subdf.index.values]
                    sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]
                    co = awesome_cossim_topn(sub_tfidf1, sub_tfidf2.transpose(), ntop=ntop,
                                             lower_bound=cosine_lower_bound,
                                             use_threads=use_threads, n_jobs=n_jobs).tocoo()

                    # 2) now use the Levenshtein distance to find the best match
                    for row in set(co.row):
                        rowcol = co.col[co.row == row]
                        argmatch, lev_dist = levenshtein_best_match(
                            subdf.iloc[row][column2match_approx],
                            df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
                        if lev_dist >= lev_lower_bound:
                            matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]

            # register our pandas apply with tqdm for a progress bar
            tqdm.pandas(desc='Publication Matches', disable=not show_progress)

            df1.groupby(columns2match_exact, group_keys=True).progress_apply(subgroup_match)

        return matches
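# Usage sketch (illustrative only): assumes `awesome_cossim_topn` (from sparse_dot_topn),
# `levenshtein_best_match`, TfidfVectorizer, numpy and tqdm are available as in the function above;
# the two toy publication tables are invented.
import pandas as pd

pubs_a = pd.DataFrame({'Title': ['Deep learning', 'Graph theory primer'], 'Year': [2015, 2016]})
pubs_b = pd.DataFrame({'Title': ['Deep learning', 'Graph theory primer.'], 'Year': [2015, 2016]})
matches = align_publications(pubs_a, pubs_b, columns2match_exact=None,
                             column2match_approx='Title', show_progress=True)
# matches[i] holds the position of the matched row in the second table for row i of the first,
# or NaN if no candidate passed the cosine and Levenshtein thresholds.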
def augment_annotation(bam, ranges):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        def extract_annot(row):
            # bam_data['reference_start'] >= 155179779
            # start_data = bam_data[bam_data['reference_start'] >= 155179779]
            # TODO: There is something FUBAR in the start_data calculation
            bam_data = bam.get_sam_annotation(row.Chromosome, row.Start, row.End)

            if bam_data is None:
                return 0, 0, 0, 0, \
                    0, 0, 0, 0, 0, 0, 0, 0, \
                    0, 0

            start_data = bam_data.loc[(
                (bam_data.reference_start + bam_data.reference_length <= row.End)
                & (bam_data.strand == "+")
                | ((bam_data.strand == "-") & (bam_data.reference_start >= row.Start)))]
            #start_data = bam_data[bam_data['reference_start'] >= row.Start]

            # rstart - the number of reads that start within the given interval
            rstart = len(start_data)
            # basesstart - the number of bases contained within rstart
            bases_start = start_data.reference_length.sum()
            # meanreadlen - mean read length for any reads within this interval
            mean_read_len = bam_data.reference_length.mean()
            # startreadlen - mean read length for reads that start within interval
            start_read_len = start_data.reference_length.mean()
            # strandp
            strand_p = (bam_data.strand == '+').sum()
            # strandn
            strand_n = (bam_data.strand == '-').sum()
            # mapq - mapq for reads starting in segment
            mapq = (-10 * log10((10**(start_data.mapping_quality / -10)).mean()))
            # map0 - mapq for reads overlapping the segment
            map0 = (-10 * log10((10**(bam_data.mapping_quality / -10)).mean()))
            # readq - per read q score for reads starting in segment
            readq = (-10 * log10((10**(start_data.mapped_read_q / -10)).mean()))
            # read0 - per read q score for reads overlapping segment
            read0 = (-10 * log10((10**(bam_data.mapped_read_q / -10)).mean()))
            # nm - this is the #NM mismatch count; reads starting in segment
            nm = start_data.nm.sum()
            # cigar_del
            cigar_d = start_data.cigar_d.sum()
            # cigar_ins
            cigar_i = start_data.cigar_i.sum()
            # cigar_mapped
            cigar_m = start_data.cigar_m.sum()

            ##### and some local sequence context annotations
            # gccount
            # ncount

            return rstart, bases_start, mean_read_len, start_read_len, \
                strand_p, strand_n, mapq, map0, readq, read0, nm, cigar_m, \
                cigar_i, cigar_d

        tqdm.pandas()
        df_data = ranges.df
        df_data[[
            'rstart', 'bases_start', 'mean_read_len', 'start_read_len',
            'strand_p', 'strand_n', 'mapq', 'map0', 'readq', 'read0', 'nm',
            'cigar_m', 'cigar_i', 'cigar_d'
        ]] = df_data.progress_apply(extract_annot, axis=1, result_type='expand')

        return pr.PyRanges(df_data)
def create_person_offer(transcript, portfolio, profile, person_transaction):
    """
    A function to generate a new df with one person and offer per row.

    Arguments:
    transcript -- Dataframe that contains all events
    portfolio -- Dataframe that contains details of offers
    profile -- Dataframe that contains details about customers
    person_transaction -- Dataframe with transactions per person (used by get_before_after_mean)

    Returns:
    person_offer_df -- new DataFrame
    """
    tqdm.pandas()
    to_be_appended = None
    # This will not include transactions, so we need another new table for those.
    for (person_id, offer_index), transcript_grouped in tqdm(
            transcript.dropna(subset=['offer_index']).groupby(['person', 'offer_index'])):
        this_offer = portfolio.loc[offer_index]
        this_person = profile.loc[person_id]
        to_be_appended = append_one_person_offer(to_be_appended, this_offer, person_id,
                                                 offer_index, transcript_grouped, this_person)

    person_offer_df = pd.DataFrame(to_be_appended)

    # TODO: the parts above and below were originally written at completely different times as
    # different files and functions; now they live in one place. There is still probably a lot
    # that could be done in a single pass instead of looping multiple times over the same data.
    # It works, but it could be made much faster.

    person_offer_df['before_start'] = 0
    person_offer_df['same_day_start'] = 0
    person_offer_df['after_start'] = 0
    person_offer_df['before_view'] = 0
    person_offer_df['same_day_view'] = 0
    person_offer_df['after_view'] = 0
    person_offer_df['before_complete'] = 0
    person_offer_df['same_day_complete'] = 0
    person_offer_df['after_complete'] = 0
    person_offer_df['w_before'] = 0
    person_offer_df['sum_during'] = 0
    person_offer_df['mean_during'] = 0
    person_offer_df['w_after'] = 0

    person_offer_df = person_offer_df.progress_apply(
        get_before_after_mean, person_transaction=person_transaction, axis=1)

    person_offer_df['viewed_reltime'] = np.nan
    person_offer_df['completed_reltime'] = np.nan

    def absolute2relative_time(x):
        """Converts absolute time (hours since start of simulation) to hours since offer received (start)"""
        if x.viewed:
            x.viewed_reltime = x.viewed_time - x.start
        if x.completed:
            x.completed_reltime = x.completed_time - x.start
        return x

    person_offer_df = person_offer_df.progress_apply(absolute2relative_time, axis=1)

    # makes it easier to access these combinations
    person_offer_df['complete_viewed'] = (
        person_offer_df['completed'] & person_offer_df['viewed']).astype(int)
    person_offer_df['complete_not_viewed'] = (
        person_offer_df['completed'] & ~person_offer_df['viewed']).astype(int)
    person_offer_df['not_complete_not_viewed'] = (
        ~person_offer_df['completed'] & ~person_offer_df['viewed']).astype(int)
    person_offer_df['not_complete_viewed'] = (
        ~person_offer_df['completed'] & person_offer_df['viewed']).astype(int)
    person_offer_df['completed'] = person_offer_df['completed'].astype(int)
    person_offer_df['viewed'] = person_offer_df['viewed'].astype(int)

    # calculates diff in sales before and after an event
    for x in ['start', 'view', 'complete']:
        person_offer_df[f'diff_{x}'] = person_offer_df[f'after_{x}'] - person_offer_df[f'before_{x}']
    person_offer_df['diff_offer'] = person_offer_df['w_after'] - person_offer_df['w_before']

    # recalculates became_member_on to member_since instead (where the newest member is 0) in days
    person_offer_df['became_member_on'] = pd.to_datetime(
        person_offer_df['became_member_on'], format='%Y-%m-%d')
    person_offer_df['member_since_days'] = (
        person_offer_df['became_member_on'].max() - person_offer_df['became_member_on']).dt.days

    # remove these wrong ages and turn to NaN
    person_offer_df['age'] = person_offer_df['age'].apply(lambda x: np.nan if x == 118 else x)

    return person_offer_df
def run(
    self,
    value: Optional[str] = None,
    data: Optional[pd.DataFrame] = None,
    timespan: Optional[TimeSpan] = None,
    options: Optional[Iterable[str]] = None,
    **kwargs,
) -> TIEnrichResult:
    """
    Return an enriched set of Alerts.

    Parameters
    ----------
    timespan : TimeSpan
        Timespan for queries
    options : Optional[Iterable[str]], optional
        List of options to use, by default None.
        A value of None means use default options.
        Options prefixed with "+" will be added to the default options.
        To see the list of available options type `help(cls)` where
        "cls" is the notebooklet class or an instance of this class.
    value: Optional[str], optional
        If you want to filter Alerts based on a specific entity specify it as a string.
    data: Optional[pd.DataFrame], optional
        If you have alerts in a DataFrame you can pass them rather than
        having the notebooklet query alerts.

    Returns
    -------
    TIEnrichResult
        Result object with attributes for each result type.

    Raises
    ------
    MsticnbMissingParameterError
        If required parameters are missing
    MsticnbDataProviderError
        If data is not available
    """
    # This line uses logic in the superclass to populate options
    # (including default options) into this class.
    super().run(value=value, data=data, timespan=timespan, options=options, **kwargs)

    if not timespan and data is None:
        raise MsticnbMissingParameterError("timespan.")

    # If data is not provided, query Sentinel to get it.
    if data is None:
        nb_print("Collecting alerts")
        if value is not None:
            data = _get_all_alerts(self.query_provider, timespan, value)
        else:
            data = _get_all_alerts(self.query_provider, timespan)

    # Create a result class
    # Add description to results for context
    self._last_result = TIEnrichResult(
        description=f"Enriched alerts, with the filter of {value}")

    # Establish TI providers
    if "tilookup" in self.data_providers.providers:
        ti_prov = self.data_providers.providers["tilookup"]
    else:
        raise MsticnbDataProviderError("No TI providers available")

    if isinstance(data, pd.DataFrame) and not data.empty:
        data["Entities"] = data["Entities"].apply(_entity_load)
        tqdm.pandas(desc="TI lookup progress")
        ti_sec = False
        if "secondary" in self.options:
            ti_sec = True
        md("""Alerts enriched with threat intelligence - TI Risk is the
           highest score provided by any of the configured providers.""")
        data["TI Risk"] = data.progress_apply(
            lambda row: _lookup(row, ti_prov, secondary=ti_sec), axis=1)
        if not self.silent:
            display(data[[
                "StartTimeUtc",
                "AlertName",
                "Severity",
                "TI Risk",
                "Description",
            ]].sort_values(by=["StartTimeUtc"]).style.applymap(
                _color_cells).hide_index())
        if "details" in self.options:
            self._last_result.picker = _alert_picker(data, ti_prov,
                                                     secondary=ti_sec,
                                                     silent=self.silent)
    else:
        raise MsticnbDataProviderError("No alerts available")

    self._last_result.enriched_results = data

    return self._last_result