from langid.langid import LanguageIdentifier, model
from pandarallel import pandarallel


def tag_lang(data, txt_var='text_clean'):
    """
    Tag the language of all text in data;
    return the language, score and post ID for each row.

    :param data: data frame
    :param txt_var: name of the text column
    :returns: data frame with language, score and post ID
    """
    lang_id_model = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    # parallel classification
    MAX_JOBS = 5
    pandarallel.initialize(nb_workers=MAX_JOBS)
    lang_score_vals = data.loc[:, txt_var].parallel_apply(lang_id_model.classify)
    # serial
    # TODO: why does langid wreck CPU use?
#     lang_score_vals = data.loc[:, txt_var].apply(lang_id_model.classify)
    # separate lang/score
    lang_val, lang_score = zip(*lang_score_vals)
    lang_var = 'lang'
    lang_score_var = 'lang_score'
    post_id_var = 'id'
    data = data.assign(**{
        lang_var : lang_val,
        lang_score_var : lang_score,
    })
    lang_id_data = data.loc[:, [lang_var, lang_score_var, post_id_var]]
    return lang_id_data
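
For reference, the basic pandarallel pattern used throughout these examples is: initialize once, then call parallel_apply where you would otherwise call apply. A minimal self-contained sketch with a toy stand-in classifier (not the langid model used above):

import pandas as pd
from pandarallel import pandarallel


def classify_stub(text):
    # stand-in for lang_id_model.classify: returns a (language, score) pair
    return ("en", 1.0) if text.isascii() else ("??", 0.5)


if __name__ == "__main__":
    pandarallel.initialize(nb_workers=2, progress_bar=False)
    df = pd.DataFrame({"id": [1, 2], "text_clean": ["hello world", "héllo wörld"]})
    lang, score = zip(*df["text_clean"].parallel_apply(classify_stub))
    print(df.assign(lang=lang, lang_score=score))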
Example #2
def plot_PM_vs_sent_all_sites(master_csv):
    '''
    Computes and plots the Pearson correlation coefficient of PM2.5 vs. Sentinel band
    means across all entries in the master .csv (i.e. all readings at all sites).
    '''
    pandarallel.initialize()

    df = pd.read_csv(master_csv)
    subdir = "visuals/repaired/allsites/"
    
    for band in [0]:  # use range(0, 13) to cover all Sentinel bands

        label = 'b' + str(band + 1) + ' mean'
        df[label] = df.parallel_apply(get_sentinel_band_mean, band=band, axis=1)
        pearson = pearsonr(df[label], df['Daily Mean PM2.5 Concentration'])
        mean_day_diff = df['PM Reading/Image day difference'].mean()
                
        if np.isnan(pearson[0]):
            continue
                                    
        print("Band {} pearson: {}. Day difference: {}".format(band, pearson, mean_day_diff))
        
        pearson_str = "r = " + str(round(pearson[0], 3))
        plt.scatter(df[label], df['Daily Mean PM2.5 Concentration'], s=1, label = pearson_str)
        plt.xlabel("Sentinel Band " + str(band) + " Mean Value")
        plt.ylabel("PM2.5 Concentration")
        plt.title("Sentinel Band " + str(band) + " Mean vs. PM2.5 value across all sites", fontsize=16)
        plt.legend(loc='upper left') 
        plt.savefig(subdir + "PM_vs_sent_band_" +str(band)+"_mean_for_all_sites.png")            
        plt.show()
        plt.clf()
                
    print("Pearson across dataset: {}".format(pearson))
def calc_all_embeddings(df, model, cfg):
    """Calculate all document embeddings."""

    # pick representative sentences from documents
    print("[clustering] choosing representative sentences...")
    if cfg.rep_sents_path != "":
        rep_sents = np.load(cfg.rep_sents_path, allow_pickle=True)
    elif cfg.num_workers == 1:
        rep_sents = df.apply(pick_rep_sentences, axis=1).to_numpy()
    else:
        pandarallel.initialize(nb_workers=cfg.num_workers)
        rep_sents = df.parallel_apply(pick_rep_sentences, axis=1).to_numpy()
    np.save("model/clustering/rep_sents.npy", rep_sents, allow_pickle=True)

    # finetune
    if cfg.sbert_finetune:
        print(f"[clustering] finetuning the model...")
        finetune_sbert(model, df, rep_sents, cfg.sbert_finetune_cfg)

    # calculate document embeddings
    print("[clustering] calculating document embeddings...")
    if cfg.doc_embs_path != "":
        doc_embs = np.load(cfg.doc_embs_path, allow_pickle=False)
    else:
        rs_tqdm = tqdm(rep_sents, position=0)
        doc_embs = np.array([embed_doc(rs, model) for rs in rs_tqdm])
    np.save("model/clustering/doc_embs.npy", doc_embs, allow_pickle=False)
    return doc_embs
Example #4
def transform(test_size: float) -> None:
    """Transform the category classification dataset into fastText format.

    The original CSV file is split into *.train and *.test files where each file
    contains lines of the form '__label__<actual_label> sample text'. test_size
    is the fraction of samples placed in the test split.

    """
    pandarallel.initialize()

    cat_df = pd.read_csv(paths.CATEGORY_DATA_PATH)
    cat_df = cat_df.sample(frac=1)

    # Transform category dataset into fastText format.
    lines = cat_df.parallel_apply(
        lambda ad: f"__label__{ad['label']} {ad['sample']}", axis=1)

    test_index = int(test_size * len(lines))

    test_data = lines[:test_index]
    train_data = lines[test_index:]

    logger.info(
        f"Transformed {len(train_data)} train and {len(test_data)} test samples"
    )

    with open(paths.FASTTEXT_CATEGORY_TRAIN_PATH, "w") as fp:
        fp.writelines("\n".join(train_data))
        logger.info(
            f"Saved train dataset to '{paths.FASTTEXT_CATEGORY_TRAIN_PATH}'")

    with open(paths.FASTTEXT_CATEGORY_TEST_PATH, "w") as fp:
        fp.writelines("\n".join(test_data))
        logger.info(
            f"Saved test dataset to '{paths.FASTTEXT_CATEGORY_TEST_PATH}'")
def compute_BM25(corpus_df: pd.DataFrame, query_df: pd.DataFrame,
                 data_col: str, f_name: str, reindex: bool = False) -> np.ndarray:
    pandarallel.initialize()
    base_path = "/lfs/1/sahaana/enrichment/data/Okapi25Queries"
    corpus = list(corpus_df[data_col].parallel_apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    bm25 = query_df[data_col].parallel_apply(
        lambda x: indexed.get_scores(x.split()))
    bm25 = np.vstack(bm25)
    np.save(f"{base_path}/{f_name}.npy", bm25)
    final = np.argsort(bm25, axis=1)

    if not reindex:
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        print(f"Saved {f_name}")
        return final
    else:
        corpus_indexes = np.array(corpus_df.index)
        query_index = np.array(query_df.index)

        final = corpus_indexes[final]
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        np.save(f"{base_path}/{f_name}_QIDs.npy", query_index)
        print(f"Saved {f_name}")
        return query_index, bm25, final
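
For context, BM25Okapi above is assumed to come from the rank_bm25 package; a toy sketch of the index/score calls this function relies on:

import numpy as np
from rank_bm25 import BM25Okapi

corpus = ["the cat sat", "a dog barked", "the cat purred"]
indexed = BM25Okapi([doc.split() for doc in corpus])
scores = indexed.get_scores("cat purred".split())  # one score per corpus document
print(np.argsort(scores))  # ascending: best-matching document index comes last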
Example #6
def extract_features(df_transactions: pd.DataFrame,
                     model_folder: str,
                     progress_bar: bool = False) -> pd.DataFrame:

    from .feng_utils import calc_features
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=progress_bar)

    ttype_direction_mapping, mcc_group_mapping = joblib.load(
        str(Path(model_folder) / "ttype_mcc_group_mappings.pkl"))
    ttypes = [*ttype_direction_mapping]
    mcc_groups = [*mcc_group_mapping]

    df_features = (df_transactions.groupby(["user_id"]).parallel_apply(
        lambda group: calc_features(group, ttypes, mcc_groups)).apply(
            pd.Series))

    returning_user_lookup_table_df = pd.read_csv(
        str(Path(model_folder) / "returning_user_lookup_table_df.csv"))
    returning_user_lookup_table_df.set_index("user_id", inplace=True)

    df_features = df_features.join(returning_user_lookup_table_df)
    df_features["is_new_customer"] = df_features[
        "prev_monthly_in_flow_avg"].isna()

    inflow_model_features = joblib.load(
        str(Path(model_folder) / "inflow_model_features.pkl"))
    outflow_model_features = joblib.load(
        str(Path(model_folder) / "outflow_model_features.pkl"))
    extracted_features = df_features.columns
    assert set(extracted_features) >= set(inflow_model_features) and set(
        extracted_features) >= set(outflow_model_features)

    return df_features
Example #7
File: sgt.py  Project: tejaswini0608/sgt
    def transform(self, corpus):
        '''
        Inputs:
        corpus       A list of sequences. Each sequence is a list of alphabets.

        The difference between fit_transform() and transform() is:
        in transform() the alphabets are already known, while in
        fit_transform() they are not known and are computed first.
        The computation in fit is essentially getting the alphabet set.
        '''

        if self.mode == 'default':
            sgt = corpus.apply(
                lambda x: [x['id']] + list(self.fit(x['sequence'])),
                axis=1,
                result_type='expand')
            sgt.columns = ['id'] + self.feature_names
            return sgt
        elif self.mode == 'multiprocessing':
            # Import
            from pandarallel import pandarallel
            # Initialization
            pandarallel.initialize(nb_workers=self.processors)
            sgt = corpus.parallel_apply(
                lambda x: [x['id']] + list(self.fit(x['sequence'])),
                axis=1,
                result_type='expand')
            sgt.columns = ['id'] + self.feature_names
            return sgt
Example #8
def cylinder_fit(self):

    print('running new version')

    #     if 'sf_radius' in self.centres.columns:
    #         del self.centres['sf_radius']

    for c in self.centres.columns:
        if 'sf' in c:
            del self.centres[c]

    node_id = self.centres[self.centres.n_points > self.min_pts].sort_values(
        'n_points').node_id.values

    groupby_ = self.pc.loc[self.pc.node_id.isin(node_id)].groupby('node_id')
    pandarallel.initialize(progress_bar=True, verbose=2)

    cyl = groupby_.parallel_apply(RANSAC_helper)
    #     cyl = groupby_.apply(RANSAC_helper)
    cyl.columns = ['sf_radius', 'centre']
    cyl.reset_index(inplace=True)
    cyl.loc[:, 'sf_cx'] = cyl.centre.apply(lambda c: c[0])
    cyl.loc[:, 'sf_cy'] = cyl.centre.apply(lambda c: c[1])
    cyl.loc[:, 'sf_cz'] = cyl.centre.apply(lambda c: c[2])
    self.centres = pd.merge(
        self.centres,
        cyl[['node_id', 'sf_radius', 'sf_cx', 'sf_cy', 'sf_cz']],
        on='node_id',
        how='left')
Example #9
def customer_file_parse(args, input_sentence):

    nltk.download('stopwords')
    nltk.download('punkt')

    corpus_sentences = []

    if args.paraphrase_corpus and args.input_file_path:
        output_path = os.path.join(args.data_dir, 'new_test.tsv')
        print('File reading...')
        input_file_path = args.input_file_path

        data = pd.read_csv(input_file_path, sep='\t')

        pandarallel.initialize()

        groups = data.groupby("#2 String")
        num = groups.count()['#1 ID'].iloc[0]
        g = groups.parallel_apply(group_func).set_index(
            'Quality').reset_index()
        if num > 10:
            recall = g['recall'].sum() / groups.ngroups
            print("tf-idf recall: " + str(recall))

        g.to_csv(output_path, index=False, sep='\t')

    return corpus_sentences
Example #10
 def init_okapi25(self, data_col):  # method snippet; self and data_col come from the enclosing class
     corpus = list(self.df_r[data_col].apply(lambda x: x.split()))
     indexed = BM25Okapi(corpus)
     pandarallel.initialize()
     bm25 = self.df_r[data_col].parallel_apply(
         lambda x: indexed.get_scores(x.split()))
     return np.argsort(bm25, axis=1)
Example #11
def build_people_using_nhanes_for_sampling(nhanes,
                                           n,
                                           outcome_model_repository,
                                           filter=None,
                                           random_seed=None,
                                           weights=None):
    if weights is None:
        weights = nhanes.WTINT2YR
    repeated_sample = nhanes.sample(n,
                                    weights=weights,
                                    random_state=random_seed,
                                    replace=True)
    pandarallel.initialize(verbose=1)
    people = repeated_sample.parallel_apply(
        build_person,
        outcome_model_repository=outcome_model_repository,
        axis="columns")

    for i in range(0, len(people)):
        people.iloc[i]._populationIndex = i

    if filter is not None:
        people = people.loc[people.apply(filter)]

    return people
Example #12
File: sgt.py  Project: tejaswini0608/sgt
    def fit_transform(self, corpus):
        '''
        Inputs:
        corpus       A list of sequences. Each sequence is a list of alphabets.
        '''

        if (len(self.alphabets) == 0):
            self.alphabets = self.estimate_alphabets(corpus['sequence'])
            self.feature_names = self.__set_feature_names(self.alphabets)

        if self.mode == 'default':
            sgt = corpus.apply(
                lambda x: [x['id']] + list(self.fit(x['sequence'])),
                axis=1,
                result_type='expand')
            sgt.columns = ['id'] + self.feature_names
            return sgt
        elif self.mode == 'multiprocessing':
            # Import
            from pandarallel import pandarallel
            # Initialization
            pandarallel.initialize(nb_workers=self.processors)
            sgt = corpus.parallel_apply(
                lambda x: [x['id']] + list(self.fit(x['sequence'])),
                axis=1,
                result_type='expand')
            sgt.columns = ['id'] + self.feature_names
            return sgt
Example #13
def skeleton(self, eps):

    # run pandarallel on groups of points
    groupby = self.pc.groupby('slice_id')
    pandarallel.initialize(nb_workers=min(24, len(groupby)),
                           progress_bar=False)
    sent_back = groupby.parallel_apply(find_centre, self, eps).values

    # create and append clusters and filtered pc
    # (note: DataFrame.append was removed in pandas 2.0; use pd.concat there)
    self.centres = pd.DataFrame()
    self.pc = pd.DataFrame()
    for x in sent_back:
        self.centres = self.centres.append(x[0])
        self.pc = self.pc.append(x[1])

    # reset index as appended df have common values
    self.centres.reset_index(inplace=True)
    self.pc.reset_index(inplace=True)

    # convert binary cluster reference to int
    MAP = {v: i for i, v in enumerate(self.centres.idx.unique())}
    if 'level_0' in self.pc.columns: self.pc = self.pc.drop(columns='level_0')
    if 'index' in self.pc.columns: self.pc = self.pc.drop(columns='index')
    self.pc.loc[:, 'node_id'] = self.pc.idx.map(MAP)
    self.centres.loc[:, 'node_id'] = self.centres.idx.map(MAP)
def compute_true_month_averages(master_csv, true_averages_csv):
    '''
    Computes the ground truth PM monthly averages from daily labels
    from a given master csv file and saves to the provided
    true_averages_csv file.
    '''
    pandarallel.initialize()

    df = pd.read_csv(master_csv)

    # Index on 'Month' and 'Site ID' to compute averages at each station for the month
    months = df.parallel_apply(get_month, axis=1)
    df['Month'] = months

    epa_stations = df['Site ID'].unique()
    num_sites = len(epa_stations)

    with open(true_averages_csv, 'a') as fd:
        writer = csv.writer(fd)
        writer.writerow(["Site ID", "Month", "Month Average"])
        for i, station_id in enumerate(epa_stations):

            station_datapoints = df[df['Site ID'] == station_id]

            for month in range(1, 13):

                month_m_at_station_i = station_datapoints[
                    station_datapoints['Month'] == month]
                pms_for_month_m_at_station_i = month_m_at_station_i[
                    'Daily Mean PM2.5 Concentration']
                month_average = np.mean(pms_for_month_m_at_station_i)
                row = [station_id, month, month_average]
                writer.writerow(row)
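
The per-station, per-month loop above can also be written as a single groupby aggregation; a small self-contained sketch with a toy frame that reuses the same column names (an equivalence sketch, not the original code):

import pandas as pd

# toy frame with the same columns as the master csv
df = pd.DataFrame({
    "Site ID": [1, 1, 2],
    "Month": [1, 1, 1],
    "Daily Mean PM2.5 Concentration": [10.0, 14.0, 8.0],
})
month_means = (df.groupby(["Site ID", "Month"])["Daily Mean PM2.5 Concentration"]
                 .mean()
                 .reset_index(name="Month Average"))
print(month_means)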
Example #15
def create_pretokenized_dataset():

    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    # Use every core on the machine.
    pandarallel.initialize(use_memory_fs=False)

    config = deepLegisConfig("bert_128.json")

    # Create a dataframe out of the ml_data.csv by adding the text to it.
    df, _ = createDeepLegisDataFrame(config, read_cached=False)

    # Take the text and tokenize it into the final product the model wants to see.
    tokenizer = config.tokenizer

    def tokenizer_wrapper(text):
        d = tokenizer(text,
                      truncation=True,
                      padding='max_length',
                      max_length=config.max_length)
        return (d['input_ids'])

    tic = time.perf_counter()
    df['tokens'] = df.text.parallel_apply(tokenizer_wrapper)
    toc = time.perf_counter()

    logger.info(
        f"Tokenized in {(toc-tic)/60.0} min -  {toc - tic:0.4f} seconds")

    print(df.head())

    # Save it for later use
    pickle_file = config.data_vol + "preprocessed_df_128.pkl"
    pickle.dump(df, open(pickle_file, "wb"))
Example #16
def main():

    out_path = "/bigtemp/rm5tx/nlp_project/2016-06_all_predicted.csv"
    model = ProjModel.load_from_checkpoint(
        checkpoint_path=os.path.expanduser("~/saved_models/last.ckpt"))
    tprint("Model Loaded")
    DATA_PATH = os.path.expanduser("~/data_cache/")

    data = ProjData(max_len=128)
    data.load(DATA_PATH)
    tprint("Data Loaded")

    neg_data = get_dataset("/localtmp/rm5tx/2016-06_all.csv")

    # pool = mp.Pool(processes=15)
    pandarallel.initialize(nb_workers=18)

    tprint("Starting Tokenize")
    neg_data['tokenized'] = neg_data['data'].parallel_map(
        lambda x: tokenize(data.max_len, data.tokenizer, x))
    tprint("Finished Tokenize")

    torch.save(neg_data, open(DATA_PATH + "df_to_be_inferred.pt", "wb"))
    neg_data.to_csv(DATA_PATH + "df_to_be_inferred.csv")

    inputs = torch.tensor(neg_data['tokenized'])
    masks = inputs.ne(0)

    neg_data.to_csv(DATA_PATH + "df_to_be_inferred.csv")
    torch.save(inputs, open(DATA_PATH + "inputs_to_be_inferred.pt", "wb"))

    tprint("Saved Inputs")

    labels = []
    masked_input = TensorDataset(inputs, masks)
    dataloader = DataLoader(masked_input, batch_size=1000)
    model.eval()
    for batch in dataloader:
        b_input, b_mask = batch
        #print(b_input.shape)
        #print(b_mask.shape)
        #print(model(b_input,b_mask).shape)
        labels.extend(model(b_input, b_mask).tolist())
    #print(labels)
    #model.eval()
    #print(type(model))
    #print(model(inputs,masks).shape)

    #sents = neg_data['data'].tolist()
    #sents = ["random sentence", "pretty flowers", "idiot", "f**k you c**t nigger"]
    #xs,masks = data.process(sents)
    #for sent in sents:
    #    x, mask = data.process(sent)
    #print(sent,' ',model(x, mask).item())
    #    labels.append(model(x, mask).item())

    neg_data["label"] = pd.Series(labels)
    #print(neg_data[["data","label","author"]])
    neg_data.to_csv(out_path)
Example #17
def main(current_price_strategy: str, price_update_strategy: str,
         market_extract_path: Path, output_path: Path, force_update: bool):
    set_log_conf(log_path=os.getcwd())
    pandarallel.initialize(progress_bar=True)
    logger = logging.getLogger(__name__)
    logger.info("Starting run")

    client = CardMarketClient()
    current_price_computer = CurrentPriceComputer(
        strategy_name=current_price_strategy)
    price_updater = PriceUpdater(strategy_name=price_update_strategy)

    # Get the stock as a dataframe
    stock_df = client.get_stock_df()

    # Set the market extract path
    set_market_extract_path(market_extract_parent_path=market_extract_path)

    # Load the saved product prices
    reset_market_extract(force_update=force_update)

    _get_product_market_extract = partial(get_single_product_market_extract,
                                          card_market_client=client)

    def get_product_price(product_id):
        market_extract = _get_product_market_extract(product_id=product_id)
        try:
            return current_price_computer.get_current_price_from_market_extract(
                market_extract=market_extract)
        except SuitableExamplesShortage:
            # We try with a larger request in case of a lack of suitable examples
            market_extract = _get_product_market_extract(product_id=product_id,
                                                         max_results=500)
            try:
                return current_price_computer.get_current_price_from_market_extract(
                    market_extract=market_extract)
            except SuitableExamplesShortage:
                return float("nan")

    # Put the product prices in the df
    try:
        stock_df["ActualPrice"] = stock_df["idProduct"].parallel_apply(
            get_product_price)
    except Exception as error:
        logger.error(error)
        raise
    finally:
        stock_df.to_csv(output_path / "stock.csv")

    # Computes the new price
    stock_df = price_updater.update_df_with_new_prices(stock_df=stock_df)

    # Saves the result
    stock_df.to_csv(output_path / "stock.csv")
    # Saves only the updated prices separately
    stock_df.loc[~pd.isna(stock_df["NewPrice"])].to_csv(output_path /
                                                        "updated_stock.csv")

    logger.info("End of the run")
Example #18
def init():
    pandarallel.initialize()
    our_domain = 'localhost:7070'
    locator = Nominatim(domain=our_domain,
                        scheme='http',
                        user_agent="myGeocoder2")
    geocode = RateLimiter(locator.reverse, min_delay_seconds=0.3)
    return geocode
Example #19
def checkOS():
    from sys import platform  # assumed source of `platform` in the original module
    if platform == "linux" or platform == "linux2" or platform == "darwin":
        print("Found *NIX like System.")
        from pandarallel import pandarallel  # import only when we need it
        pandarallel.initialize(nb_workers=CORES, verbose=0)
        isUnix = True
    else:
        print("Found Non-*NIX like System.")
        isUnix = False
    return isUnix
def create_non_tda_features(path,
                            fourier_window_size=[],
                            rolling_mean_size=[],
                            rolling_max_size=[],
                            rolling_min_size=[],
                            mad_size=[],
                            fourier_coefficients=[]):
    """
    INPUT:
        path: int (number to OpenML dataset)
        fourier_window_size: a list of window sizes. Note: min must be > max(fourier_coefficients)
        rolling_mean_size: a list of window sizes
        rolling_max_shift: a list of window sizes
        rolling_min_shift: a list of window sizes
        mad_size: a list of window sizes
        fourier_coefficients: a list of all fourier coefficients to include.
                              Note: max must be < min(fourier_window_size)
    OUTPUT:
        df: pandas dataframe with columns:
            max_... for rolling max features
            min_... for rolling min features
            mean_... for rolling mean features
            mad_... for rolling mad features
            fourier_... for fourier coefficients
    """

    df = get_dataset(path)
    df = df.get_data()[0]
    df.rename({
        'label': 'y',
        'coord_0': 'x',
        'coord_1': 'x_dot'
    },
              axis='columns',
              inplace=True)

    pandarallel.initialize()

    for r in rolling_max_size:
        df['max_' + str(r)] = df['x'].rolling(r).max()
    for r in rolling_mean_size:
        df['mean_' + str(r)] = df['x'].rolling(r).mean()
    for r in rolling_min_size:
        df['min_' + str(r)] = df['x'].rolling(r).min()
    for r in mad_size:
        df['mad_' + str(r)] = df['x'] - df['x'].rolling(r).min()
    if (not fourier_coefficients
            and fourier_window_size) or (not fourier_window_size
                                         and fourier_coefficients):
        print('Need to specify both the Fourier coefficients and the window size')
    for w in fourier_window_size:
        for n in fourier_coefficients:
            df[f'fourier_{w}_{n}'] = df['x'].rolling(w).parallel_apply(
                lambda x: rfft(x)[n], raw=False)
    # Remove all rows with NaNs
    df.dropna(axis='rows', inplace=True)
    return df
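
A small self-contained sketch of the rolling Fourier feature on synthetic data (taking the magnitude so the rolling apply returns a plain float; the window size and coefficient index are illustrative):

import numpy as np
import pandas as pd
from numpy.fft import rfft

s = pd.Series(np.sin(np.linspace(0, 20 * np.pi, 500)))
w, n = 32, 1  # window size and Fourier coefficient index; w must be > n
coef = s.rolling(w).apply(lambda x: np.abs(rfft(x)[n]), raw=True)
print(coef.dropna().head())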
Example #21
def add_lang(dataframe):
    '''
    Add the detected language as a new column.
    [dataframe] : pandas dataframe
    '''
    df_copy = deepcopy(dataframe)
    pandarallel.initialize(progress_bar=True)
    df_copy['lang'] = df_copy['text'].parallel_apply(detect_lang)

    return df_copy
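
detect_lang is not shown in this snippet; a minimal stand-in built on langid (an assumption on my part; any per-row language detector works with the parallel_apply call above):

import langid


def detect_lang(text):
    # langid.classify returns a (language code, score) pair; keep the code only
    label, _score = langid.classify(text)
    return label


print(detect_lang("bonjour tout le monde"))  # expected: 'fr'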
Example #22
def process_aquifer_shapefile(shapefile: Any, region_id: int, name_attr: str,
                              id_attr: str, app_workspace: Any) -> Dict:
    """
    Process an uploaded aquifer shapefile

    Args:
        shapefile: List of shapefile files
        region_id: Region id as listed in the database
        name_attr: Aquifer Name Column
        id_attr: Aquifer Id Column
        app_workspace: Temp App workspace

    Returns:
        Response dict with success or error string
    """
    session = get_session_obj()
    temp_dir = None

    def add_aquifer_apply(row):
        aquifer = Aquifer(region_id=region_id,
                          aquifer_name=row.aquifer_name,
                          aquifer_id=row.aquifer_id,
                          geometry=row.geometry)
        return aquifer

    try:
        start_time = time.time()
        pandarallel.initialize()
        gdf, temp_dir = get_shapefile_gdf(shapefile, app_workspace)
        gdf = gdf.dissolve(by=name_attr, as_index=False)
        # gdf.to_csv('texas_aquifers.csv')
        rename_cols = {name_attr: 'aquifer_name', id_attr: 'aquifer_id'}
        gdf.rename(columns=rename_cols, inplace=True)
        gdf = gdf[['aquifer_name', 'aquifer_id', 'geometry']]
        aquifer_list = gdf.parallel_apply(add_aquifer_apply, axis=1)

        session.add_all(aquifer_list)
        session.commit()
        session.close()
        end_time = time.time()
        total_time = (end_time - start_time)

        return {"success": "success", "total_time": total_time}

    except Exception as e:
        session.close()
        if temp_dir is not None:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
        return {"error": str(e)}
    finally:
        # Delete the temporary directory once the shapefile is processed
        if temp_dir is not None:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
def main(input_file, output_folder):
    pandarallel.initialize(progress_bar=True)

    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    def extract(file_name):
        return extract_meta_data(file_name, output_folder)

    file_names = pd.read_csv(input_file, header=0, names=['path'], index_col=None)
    file_names.path.parallel_map(extract)
Example #24
def flip_reads(df, t=1):
    if t == 1:
        df['trim_seq'] = df.apply(flip_reads_x, axis=1)
    else:
        pandarallel.initialize(nb_workers=t)
        df['trim_seq'] = df.parallel_apply(flip_reads_x, axis=1)

    df.seq = df.trim_seq
    df.drop('trim_seq', axis=1, inplace=True)

    return df
Example #25
    def __init__(self, person, n):
        self._outcome_model_repository = OutcomeModelRepository()
        self._qaly_assignment_strategy = QALYAssignmentStrategy()
        self._risk_model_repository = CohortRiskModelRepository()
        self.n = n

        people = pd.Series([copy.deepcopy(person) for i in range(0, n)])

        pandarallel.initialize(verbose=1)
        for i in range(0, len(people)):
            people.iloc[i]._populationIndex = i
        super().__init__(people)
Example #26
def convert_to_ids(df, save_dir):
    global id_col
    global freq_bound
    global attribute_columns
    pandarallel.initialize()

    feature_columns = list(sorted(attribute_columns))
    dict_DomainDims = {}
    col_val2id_dict = {}

    for col in feature_columns:
        vals = list(set(df[col]))
        vals = list(sorted(vals))

        id2val_dict = {e[0]: e[1] for e in enumerate(vals, 0)}
        print(' > ', col, ':', len(id2val_dict))

        val2id_dict = {v: k for k, v in id2val_dict.items()}
        col_val2id_dict[col] = val2id_dict

        # Replace
        df[col] = df.parallel_apply(replace_attr_with_id,
                                    axis=1,
                                    args=(
                                        col,
                                        val2id_dict,
                                    ))

        dict_DomainDims[col] = len(id2val_dict)

    print(' Feature columns :: ', feature_columns)
    print(' dict_DomainDims ', dict_DomainDims)

    # -------------
    # Save the domain dimensions
    # -------------

    file = 'domain_dims.pkl'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    f_path = os.path.join(save_dir, file)

    with open(f_path, 'wb') as fh:
        pickle.dump(dict_DomainDims, fh, pickle.HIGHEST_PROTOCOL)

    file = 'col_val2id_dict.pkl'
    f_path = os.path.join(save_dir, file)

    with open(f_path, 'wb') as fh:
        pickle.dump(col_val2id_dict, fh, pickle.HIGHEST_PROTOCOL)

    return df, col_val2id_dict
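
A design note: the per-row parallel_apply with replace_attr_with_id can usually be replaced by a vectorized Series.map over the same val2id_dict, which avoids spawning workers entirely; a toy sketch (column name and values are made up):

import pandas as pd

val2id_dict = {"US": 0, "DE": 1, "JP": 2}
df = pd.DataFrame({"country": ["DE", "US", "JP", "DE"]})
df["country"] = df["country"].map(val2id_dict)  # same effect as the per-row replace
print(df)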
Example #27
def get_features(peptide_list, index_id_list, n_cpu=1):
    if n_cpu != 1:
        pandarallel.initialize(nb_workers=n_cpu, verbose=0)
    index_data = load_index_data(index_id_list=index_id_list)
    peptide_df = pd.DataFrame()
    peptide_df['peptide'] = peptide_list
    if n_cpu == 1:
        features = peptide_df['peptide'].apply(
            lambda x: sequence_to_features(x, index_data))
    else:
        features = peptide_df['peptide'].parallel_apply(
            lambda x: sequence_to_features(x, index_data))
    return pd.DataFrame(features.tolist())
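
The serial-vs-parallel switch on n_cpu is a recurring pattern in these examples; a small helper sketch that factors it out (the helper name is my own, not from the source project):

import pandas as pd
from pandarallel import pandarallel


def maybe_parallel_apply(series: pd.Series, func, n_cpu: int = 1) -> pd.Series:
    """Apply func serially when n_cpu == 1, otherwise through pandarallel workers."""
    if n_cpu == 1:
        return series.apply(func)
    pandarallel.initialize(nb_workers=n_cpu, verbose=0)
    return series.parallel_apply(func)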
Example #28
def cal_cluster(cluster_num=500):
    """
    calculate the bovw clustering centres
    """
    pandarallel.initialize(nb_workers=50, use_memory_fs=False)
    train_list = pd.read_csv('train_list.csv')
    bag_of_features = []
    features = train_list.parallel_apply(cal_descriptors, axis=1)
    for f in features:
        bag_of_features += f
    clusters = kmeans(np.array(bag_of_features).astype('float32'),
                      cluster_num,
                      initialization="PLUSPLUS")  # k-means clustering
    return clusters
Example #29
 def parse_multiple(self, df, multiproc=False):
     """
     Parses elements and puts them in a dataframe
     :param df: Pandas dataframe
     :param multiproc: Boolean. True activates multiprocessing
     :return: the input dataframe with the parsed elements concatenated as new columns
     """
     if multiproc:
         pandarallel.initialize()
         elements_df = df[self.text_col].parallel_apply(self.collect_parsed)
     else:
         elements_df = df[self.text_col].apply(self.collect_parsed)
     multiple_elements_df = pd.concat([df, elements_df], axis=1)
     return multiple_elements_df
Example #30
def main(argv=None):
    """Loads the original corpora, applies normalization and caches the process in csv files."""
    try:
        pandarallel.initialize(progress_bar=True, use_memory_fs=True)
    except SystemError:
        pandarallel.initialize(progress_bar=True)

    OUT_DIR.mkdir(exist_ok=True)

    for corpus in [TIGER, HDT]:
        t0 = time()
        df = get_original_corpus(corpus, print_sample=-50)
        print(f'Writing {FILES[PREPROCESSED](corpus)}')
        df.to_csv(FILES[PREPROCESSED](corpus), sep='\t', index=False)
        print(f'{corpus} done in {time() - t0:.2f}s\n')