Example #1
def cache_bottlenecks(sess,
                      image_files_metadata,
                      jpeg_data_tensor,
                      bottleneck_tensor,
                      use_tqdm_notebook_widget=True):
    """Ensures all the training, testing, and validation bottlenecks are cached.

    Because we're likely to read the same image multiple times, it can speed things up a lot if we
    calculate the bottleneck layer values once for each image during
    preprocessing, and then just read those cached values repeatedly during
    training. Here we go through all the images we've found, calculate those
    values, and save them off.

    Args:
        sess: The current active TensorFlow Session.
        image_files_metadata: dataframe of training images for each label.
        jpeg_data_tensor: Input tensor for jpeg data from file.
        bottleneck_tensor: The penultimate output layer of the graph.
        use_tqdm_notebook_widget: If True, use the notebook-friendly tqdm progress bar.
    """

    # Still not sure how robust tqdm is; we may fall back to the 'old code'.
    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='Caching...')
    else:
        tqdm.pandas(desc='Caching...')
    # Process files that are already cached first, so that the progress bar
    # doesn't jump back and forth between slow and fast mode.
    alreadycached_first = image_files_metadata[
        BOTTLENECK_DATAFRAME_KEYWORD].apply(os.path.isfile).sort_values(
            ascending=False).index
    image_files_metadata.loc[alreadycached_first].progress_apply(
        lambda image_files_metadata_row: get_or_create_bottleneck(
            sess, image_files_metadata_row, jpeg_data_tensor, bottleneck_tensor
        ),
        axis=1)
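
The caching helper above relies on tqdm's pandas integration; reduced to its core, and with an illustrative dataframe standing in for image_files_metadata (column and file names here are placeholders), the pattern looks roughly like this:

import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc='Caching...')  # register progress_apply on pandas objects
df = pd.DataFrame({'path': ['a.jpg', 'b.jpg', 'c.jpg']})  # stand-in for image_files_metadata
_ = df.progress_apply(lambda row: len(row['path']), axis=1)  # any per-row function shows a progress bar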
Example #2
 def textrank(self):
     tqdm.pandas(desc='get textrank>>>')
     row_list = pd.Series(self.split_list).progress_apply(
         lambda x: [[self.dic.token2id[w[0]], w[1]]
                    for w in self.get_textrank(x)])
     #        row_list = row_list.progress_apply(lambda x: [[self.dic.token2id[w[0]], w[1]] for w in x])
     row_list = row_list.tolist()
     self.TEXTRANK_Vector = row_list
Example #3
 def run(self):
     if tools.isnotebook():
         tqdm_notebook.pandas(desc="run ilastik")
         _ = self.df.progress_apply(self.process, axis=1)
     else:
         try: 
             _ = self.df.apply(self.process, axis=1)
         except OSError:
             sys.exit(">>> Check ilastik path and config.ini.")
Example #4
 def predict(
         self,
         X):  # Takes a Series of texts and returns a Series of predictions
     if self.verbose:
         from tqdm._tqdm_notebook import tqdm_notebook
         tqdm_notebook.pandas()
         return X.progress_apply(self.predict_text_main)
     else:
         return X.apply(self.predict_text_main)
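
A minimal sketch of the verbose toggle above, with a dummy scoring function standing in for predict_text_main; note that tqdm.notebook is the current import path for the notebook bar, while tqdm._tqdm_notebook used above is the older private module.

import pandas as pd
from tqdm.notebook import tqdm as tqdm_notebook  # current equivalent of tqdm._tqdm_notebook

def predict_text_main(text):
    return len(text)  # placeholder for the real per-text prediction

verbose = True
X = pd.Series(['first text', 'second example text'])
if verbose:
    tqdm_notebook.pandas()
    preds = X.progress_apply(predict_text_main)
else:
    preds = X.apply(predict_text_main)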
Example #5
def calculate_toxicity(model, test_data):

    batch_size = 1
    max_bert_length = 220
    pytorch_conversion = False

    seed_everything(1235)
    device = torch.device('cpu')
    tqdm.pandas()

    bert_model_path = "./service/uncased_L-12_H-768_A-12/"
    base_tokenizer = BertTokenizer.from_pretrained(bert_model_path,
                                                   cache_dir=None,
                                                   do_lower_case=True)
    converted_text = convert_data(test_data, max_bert_length, base_tokenizer)
    bert_test_lengths = torch.from_numpy(
        np.array([len(x) for x in converted_text]))
    bert_test_set = torch.tensor(pad_sequences(converted_text,
                                               maxlen=max_bert_length,
                                               padding='post'),
                                 dtype=torch.long)

    bert_test_dataset = torch.utils.data.TensorDataset(bert_test_set)
    bert_test_loader = torch.utils.data.DataLoader(bert_test_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False)
    tk2 = tqdm(enumerate(bert_test_loader),
               total=len(bert_test_loader),
               leave=False)

    output_preds = []
    for i, batch in tk2:

        tsrs = trim_tensors(batch)
        x_batch, = tuple(t.to(device) for t in tsrs)
        y_pred = model(x_batch.to(device),
                       attention_mask=(x_batch > 0).to(device),
                       labels=None)
        y_pred = torch.sigmoid(
            torch.tensor(y_pred[:, 0].detach().cpu().squeeze().numpy())
        ).numpy().ravel()
        output_preds.append(y_pred)

    return output_preds
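
trim_tensors is not defined in this snippet; in the Jigsaw toxicity kernels it usually trims shared trailing padding within a batch, roughly as in the sketch below (an assumed version, not necessarily the author's).

import torch

def trim_tensors(tsrs):
    # Keep only the columns up to the longest non-padded sequence in the batch,
    # so BERT does less work on short batches. Assumes 0 is the padding id.
    max_len = int(torch.max(torch.sum((tsrs[0] != 0), 1)))
    if max_len > 2:
        tsrs = [tsr[:, :max_len] for tsr in tsrs]
    return tsrs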
Example #6
def load_cache_values(image_files_metadata,
                      notebook=True,
                      tqdm_desc='',
                      project='jax-nihcc-res-00-0011',
                      bucket_name=None,
                      apply_func=lambda x: x.tostring(),
                      user_project=None):
    assert bucket_name is not None, 'please provide bucket name'
    client = storage.Client(project=project)

    if user_project is None:
        user_project = project
    bucket = client.bucket(bucket_name, user_project=user_project)

    if notebook:
        tqdm_notebook.pandas(desc=tqdm_desc)
    else:
        tqdm.pandas(desc=tqdm_desc)

    cache_values = image_files_metadata['rel_path'].progress_apply(
        lambda x: get_blob_val(x, bucket, apply_func))
    return cache_values
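
A hedged usage sketch of the function above; the bucket name and relative paths are placeholders, and the call assumes valid GCS credentials plus a get_blob_val helper defined elsewhere.

# Hypothetical call; bucket name and paths are placeholders.
metadata = pd.DataFrame({'rel_path': ['tiles/sample_a/tile_0.jpg_cached.txt']})
cache_values = load_cache_values(metadata,
                                 notebook=False,
                                 tqdm_desc='Loading caches...',
                                 bucket_name='my-histology-bucket')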
Example #7
def assign_validation_and_other_labels_to_tiles(
        training_percentage=70,
        testing_percentage=15,
        outputfile=None,
        glob_path=None,
        lstrip_string=None,
        task_class_counts_dict={
            'mean_val': 2,
            'std_val': 2
        },
        relative_path_keyword='rel_path',
        cache_gcs_paths=None,
        use_tqdm_notebook_widget=False,
        include_cache_stats=False,
        glob_locally=False,
        backward_count_to_samplename=3,
        drop_normals=False):
    '''
    Randomly assigns slides to train/test/validation, and creates a dataframe with tile paths
    and their corresponding labels. Furthermore, it adds some basic annotations such as patient id,
    a random hash, tumor/normal status, and some optional basic statistics (e.g. mean, std)
    of the image. A field with the relative path is also constructed from each GCSurl for downstream
    analysis.

    NOTE: calculating image statistics has been implemented for caches (tsv files), and needs to
    be implemented for JPEG files (see GH-57).

    Arguments:
    training_percentage (float): Percentage of samples in the training set
    testing_percentage (float): Percentage of samples in the test set
    outputfile (str): File to save the output dataframe. If None it will not save the output
    glob_path (str): glob path only used when locally reading the files. Works if glob_locally=True
        (default:None)
    lstrip_string: Prefix to be removed from GCSurl in order to create relative paths. In most cases
        it can be the bucket name.
    task_class_counts_dict (dict): Number of classes in the optional statistics. This is used
        to split the statistics into percentiles and label each with unique integers (0, 1, ...).
        Default: {'mean_val':2, 'std_val':2}
    relative_path_keyword (str): Column name used for the relative paths constructed from GCSurl
        (default: 'rel_path')
    cache_gcs_paths (str or pd.DataFrame): the list of GCSurls for all the tiles to be annotated.
        If this is a text file where each row is a GCSurl, the argument should be the path to that
        text file. Alternatively it can be a dataframe with a similar structure.
    use_tqdm_notebook_widget (bool): flag to make tqdm work with notebooks (default: False)
    include_cache_stats (bool): set this to True in order to calculate tile statistics (e.g. image mean,
        std). Note that this process can be quite time-consuming (default: False).
    glob_locally (bool): The function is able to construct the annotations from local folder structure
        instead of GCS (default: False)

    Returns:
    cache_df (pandas.DataFrame): Dataframe containing GCSurls and their annotations.
    '''

    validation_percentage = 100 - testing_percentage - training_percentage
    # GH-79

    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='')
    else:
        tqdm.pandas(desc='')

    if glob_locally:
        print('Globbing tiles locally...')
        cache_df = glob.glob(glob_path)
        cache_df = pd.DataFrame(cache_df, columns=[relative_path_keyword])
    else:
        cache_df = util.read_csv(cache_gcs_paths,
                                 columns=['GCSurl'],
                                 sep=',',
                                 header=None)
        cache_df[relative_path_keyword] = cache_df['GCSurl'].progress_apply(
            lambda x: x[len(lstrip_string):])

    print('Randomizing tiles...')
    cache_df = cache_df.sample(frac=1, random_state=0).reset_index(drop=True)

    print('Extracting sample ids...')
    cache_df['sample_id'] = cache_df[relative_path_keyword].progress_apply(
        lambda s: s.split('/')[-backward_count_to_samplename].split('.')[0])

    print('Extracting patient ids...')
    cache_df['patient_id'] = cache_df['sample_id'].progress_apply(
        lambda x: x[:12])

    print('Extracting tumor/normal label...')
    cache_df['is_tumor'] = cache_df['sample_id'].progress_apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    assert validation_percentage + testing_percentage < 100, "There are not enough training samples"

    if drop_normals:
        print('Dropping normal samples from the list...')
        cache_df = cache_df[cache_df['is_tumor'] == 1]

    print('Extracting slide preparation method...')
    cache_df['slide_code'] = cache_df['sample_id'].map(
        lambda x: x.split('-')[-1])
    cache_df['tissue-method'] = cache_df['slide_code'].map(
        lambda x: ['Frozen', 'FFPE'][x.startswith('DX')])

    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    print('Hashing sample ids...')
    cache_df['sample_id_SHA1'] = cache_df['sample_id'].progress_apply(getSHA1)

    assert not cache_df[['sample_id_SHA1', 'sample_id']].drop_duplicates(
    )['sample_id_SHA1'].duplicated().any(), "SHA1 produced duplicates!!!"

    # Assigning 'crossval_group' per tile could leave the training and testing data highly correlated.
    # To avoid that, let's assign each patient to a single category (training, testing, validation):
    print('Assigning cross-validation labels to samples...')
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M, need a huge number.
    cache_df['crossval_group'] = cache_df['sample_id_SHA1'].progress_apply(
        lambda x: (int(x, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)
                   ) / MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        cache_df['crossval_group'] = pd.cut(
            cache_df['crossval_group'], [
                -1, testing_percentage,
                testing_percentage + validation_percentage, 100
            ],
            labels=['testing', 'validation', 'training'])
    else:  # ignore validation set
        cache_df['crossval_group'] = pd.cut(cache_df['crossval_group'],
                                            [-1, testing_percentage, 100],
                                            labels=['testing', 'training'])

    if include_cache_stats:

        def get_cache_stats(cache_filename):
            x = np.loadtxt(cache_filename, delimiter=',')
            cache_stats = {'mean_val': x.mean(), 'std_val': x.std()}
            return pd.Series(cache_stats)

        print('Calculate per tile statistics...')
        tmp = cache_df[relative_path_keyword].progress_apply(get_cache_stats)

        print('Merging the results...')
        cache_df = pd.concat([cache_df, tmp], axis=1)

        assert tmp.shape[1] == len(
            task_class_counts_dict
        ), "The number of tasks needs to match the number of fields produced by get_cache_stats"

        print('Creating tile stat labels...')
        for label, class_count in task_class_counts_dict.items():
            cache_df[label + '_label'] = pd.qcut(cache_df[label],
                                                 class_count,
                                                 labels=False)

    if outputfile is not None:
        print('Saving tile dataframe to disk...')
        cache_df.to_csv(outputfile, index=False)
        print('Saved to: {:s}'.format(outputfile))
    return cache_df
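
A hedged example call of the function above; the GCS path list, bucket prefix, and output file below are placeholders, not values from the original project.

tiles_df = assign_validation_and_other_labels_to_tiles(
    training_percentage=70,
    testing_percentage=15,
    cache_gcs_paths='data/tile_gcs_paths.txt',   # hypothetical list of GCSurls
    lstrip_string='gs://my-bucket/',             # hypothetical bucket prefix
    outputfile='data/tile_annotations.csv')
print(tiles_df['crossval_group'].value_counts())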
Example #8
def label_cache_files(
        validation_percentage=15,
        testing_percentage=15,
        outputfile='data/cache_dataframe.txt',
        task_class_counts_dict={
            'mean_val': 2,
            'std_val': 2
        },
        glob_path='tcga_tiles/luad/filelist_luad_40x_level2_downsampl16_512x512_cache/*/tiles/tile*.jpg_cached.txt',
        lstrip_string='gs://histology/',
        BOTTLENECK_DATAFRAME_KEYWORD='rel_path',
        cache_gcs_paths='data/filelist_luad_40x_level2_downsampl16_512x512_cache.txt',
        use_tqdm_notebook_widget=True,
        include_cache_stats=False,
        glob_locally=False):

    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='')
    else:
        tqdm.pandas(desc='')

    if glob_locally:
        print('Globbing tile caches...')
        cache_df = glob.glob(glob_path)
        cache_df = pd.DataFrame(cache_df,
                                columns=[BOTTLENECK_DATAFRAME_KEYWORD])
    else:
        print('Fetching cache filenames...')
        cache_df = pd.read_csv(cache_gcs_paths, sep=',', header=None)
        cache_df.columns = ['GCSurl']
        cache_df[BOTTLENECK_DATAFRAME_KEYWORD] = cache_df[
            'GCSurl'].progress_apply(lambda x: x[len(lstrip_string):])

    print('Randomizing tiles...')
    cache_df = cache_df.sample(frac=1, random_state=0).reset_index(drop=True)

    print('Extracting sample ids...')
    cache_df['sample_id'] = cache_df[
        BOTTLENECK_DATAFRAME_KEYWORD].progress_apply(
            lambda s: s.split('/')[-3].split('.')[0])

    print('Extracting patient ids...')
    cache_df['patient_id'] = cache_df['sample_id'].progress_apply(
        lambda x: x[:12])

    print('Extracting tumor/normal label...')
    cache_df['is_tumor'] = cache_df['sample_id'].progress_apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    assert validation_percentage + testing_percentage < 100, "There are not enough training samples"

    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    print('Hashing sample ids...')
    cache_df['sample_id_SHA1'] = cache_df['sample_id'].progress_apply(getSHA1)

    assert not cache_df[['sample_id_SHA1', 'sample_id']].drop_duplicates(
    )['sample_id_SHA1'].duplicated().any(), "SHA1 produced duplicates!!!"

    # Assigning 'crossval_group' per tile could leave the training and testing data highly correlated.
    # To avoid that, let's assign each patient to a single category (training, testing, validation):
    print('Assigning cross-validation labels...')
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M
    cache_df['crossval_group'] = cache_df['sample_id_SHA1'].progress_apply(
        lambda x: (int(x, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)
                   ) / MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        cache_df['crossval_group'] = pd.cut(
            cache_df['crossval_group'], [
                -1, testing_percentage,
                testing_percentage + validation_percentage, 100
            ],
            labels=['testing', 'validation', 'training'])
    else:  # ignore validation set
        cache_df['crossval_group'] = pd.cut(cache_df['crossval_group'],
                                            [-1, testing_percentage, 100],
                                            labels=['testing', 'training'])

    if include_cache_stats:

        def get_cache_stats(cache_filename):
            x = np.loadtxt(cache_filename, delimiter=',')
            cache_stats = {'mean_val': x.mean(), 'std_val': x.std()}
            return pd.Series(cache_stats)

        print('Calculate per cache statistics...')
        tmp = cache_df[BOTTLENECK_DATAFRAME_KEYWORD].progress_apply(
            get_cache_stats)

        print('Merging the results...')
        cache_df = pd.concat([cache_df, tmp], axis=1)

        assert tmp.shape[1] == len(
            task_class_counts_dict
        ), "The number of tasks needs to match the number of fields produced by get_cache_stats"

        print('Creating cache stat labels...')
        for label, class_count in task_class_counts_dict.items():
            cache_df[label + '_label'] = pd.qcut(cache_df[label],
                                                 class_count,
                                                 labels=False)

    print('Saving cache dataframe to disk...')
    cache_df.to_csv(outputfile, index=False)
    print('Saved to: {:s}'.format(outputfile))
    return cache_df
Example #9
def label_jpeg_files(
        task_class_counts_dict,
        validation_percentage,
        testing_percentage,
        glob_path='tcga_tiles/luad_40x_level_2_16/TCGA-*.svs/tiles/tile_*.jpg',
        cache_directory='tcga_tiles/luad_40x_level_2_16_cache',
        BOTTLENECK_DATAFRAME_KEYWORD='rel_path',
        IMAGE_DATAFRAME_KEYWORD='image_filename',
        use_tqdm_notebook_widget=True,
        label_image_stats=True):

    image_filenames = glob.glob(glob_path)
    image_files_metadata = pd.DataFrame(image_filenames,
                                        columns=[IMAGE_DATAFRAME_KEYWORD])
    image_files_metadata['sample_id'] = image_files_metadata[
        'image_filename'].map(lambda s: s.split('/')[-3].split('.')[0])
    image_files_metadata['is_tumor'] = image_files_metadata['sample_id'].apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    strip_shared_path = image_files_metadata['image_filename'].str.split(
        '/').apply(pd.Series)
    idx = np.where(strip_shared_path.apply(pd.Series.nunique) != 1)[0][0]
    image_files_metadata[
        BOTTLENECK_DATAFRAME_KEYWORD] = strip_shared_path.iloc[:, idx:].apply(
            lambda s: os.path.join(cache_directory, '/'.join(s) + '_cached.txt'
                                   ),
            axis=1)
    image_files_metadata = image_files_metadata.sample(
        frac=1, random_state=0).reset_index(drop=True)
    assert validation_percentage + testing_percentage < 100, "There are not enough training samples"
    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    image_files_metadata['sample_id_SHA1'] = image_files_metadata[
        'sample_id'].map(getSHA1)
    assert not image_files_metadata[[
        'sample_id_SHA1', 'sample_id'
    ]].drop_duplicates()['sample_id_SHA1'].duplicated().any(
    ), "SHA1 produced duplicates!!!"

    # Assigning 'crossval_group' per tile could leave the training and testing data highly correlated.
    # To avoid that, let's assign each patient to a single category (training, testing, validation):
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M
    image_files_metadata['crossval_group'] = image_files_metadata[
        'sample_id_SHA1'].apply(lambda x: (int(x, 16) % (
            MAX_NUM_IMAGES_PER_CLASS + 1)) / MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        image_files_metadata['crossval_group'] = pd.cut(
            image_files_metadata['crossval_group'], [
                -1, testing_percentage,
                testing_percentage + validation_percentage, 100
            ],
            labels=['testing', 'validation', 'training'])
    else:
        image_files_metadata['crossval_group'] = pd.cut(
            image_files_metadata['crossval_group'],
            [-1, testing_percentage, 100],
            labels=['testing', 'training'])

    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='Labeling...')
    else:
        tqdm.pandas(desc='Labeling...')

    def read_tile_and_calculate_intensity_stats(x):
        try:
            x_array = plt.imread(x)
        except (OSError, TypeError):
            print(
                '\nThe following file seems to be corrupted: {:s}\n'.format(x))
            x_array = np.array([np.nan])
        return calculate_intensity_stats(x_array)

    temp = image_files_metadata[IMAGE_DATAFRAME_KEYWORD].progress_apply(
        read_tile_and_calculate_intensity_stats)
    assert temp.shape[1] == len(
        task_class_counts_dict
    ), "The number of tasks needs to match the number of fields produced by calculate_intensity_stats"
    image_files_metadata = pd.concat([image_files_metadata, temp], axis=1)
    for label, class_count in task_class_counts_dict.items():
        image_files_metadata[label + '_label'] = pd.qcut(
            image_files_metadata[label], class_count, labels=False)
    return image_files_metadata
Example #10
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold, BaseCrossValidator
from sklearn.decomposition import TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import SparseRandomProjection
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale
from scipy.stats import skew, kurtosis, gmean, ks_2samp
import gc
import psutil
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

tqdm.pandas()
sns.set(style="white", color_codes=True)

# In[ ]:

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()

# # Feature engineering

# We will start by defining basic row aggregation features. These are used in most public kernels, so I will not elaborate further on this part; a minimal sketch of such features follows below.

# In[ ]:
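
# Sketch of typical row-wise aggregation features; the exact feature set and the
# 'ID'/'target' column names are assumptions, not the notebook's original cell.
from scipy.stats import skew, kurtosis  # already imported above; repeated for clarity

def add_row_aggregates(df, feature_cols):
    agg = pd.DataFrame(index=df.index)
    values = df[feature_cols]
    agg['row_sum'] = values.sum(axis=1)
    agg['row_mean'] = values.mean(axis=1)
    agg['row_std'] = values.std(axis=1)
    agg['row_max'] = values.max(axis=1)
    agg['row_nonzero'] = (values != 0).sum(axis=1)
    agg['row_skew'] = values.apply(skew, axis=1)
    agg['row_kurtosis'] = values.apply(kurtosis, axis=1)
    return pd.concat([df, agg], axis=1)

feature_cols = [c for c in train.columns if c not in ('ID', 'target')]
train = add_row_aggregates(train, feature_cols)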

Example #11
from collections import Counter

import tensorflow as tf
from common.MultiVectorizer import *
import pandas as pd
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, TimeDistributed, SpatialDropout1D, Conv1D, MaxPooling1D, Dropout, AdditiveAttention, Attention, \
    GlobalAveragePooling1D, Concatenate, Bidirectional
from tensorflow.keras.models import Model
from common.data_utils import *
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm._tqdm_notebook import tqdm_notebook
from tensorflow.keras.metrics import *
tqdm_notebook.pandas()


class AutoEncoderTextModel():
    def __init__(self, vectorizer=None, load_weights=False):
        self.vectorizer = vectorizer
        self.load_weights = load_weights
        self.METRICS = [
            BinaryAccuracy(name='accuracy'),
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc')
        ]

    def load_data(self, file_path, rows=None, validation_split=None):
        data_df = pd.read_excel(file_path, nrows=rows)

        if validation_split is not None:
Example #12
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]
#         if i < 10:
#           print(i,"-->", x[i,-1,:], y[i])
    print("length of time-series i/o",x.shape,y.shape)
    return x, y
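
# The loop above is the tail of a windowing helper whose header was cut off.
# A plausible full version is sketched below under assumptions: `mat` is a 2-D
# numpy array, TIME_STEPS is an illustrative value (the original constant is
# not shown), and the name build_timeseries is used here for illustration only.
import numpy as np
from tqdm import tqdm_notebook

TIME_STEPS = 60  # assumed window length

def build_timeseries(mat, y_col_index):
    # Slide a TIME_STEPS-long window over mat, predicting column y_col_index one step ahead.
    dim_0 = mat.shape[0] - TIME_STEPS
    x = np.zeros((dim_0, TIME_STEPS, mat.shape[1]))
    y = np.zeros((dim_0,))
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
    print("length of time-series i/o", x.shape, y.shape)
    return x, y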


stime = time.time()
print(os.listdir(INPUT_PATH))
df_ge = pd.read_csv(os.path.join(INPUT_PATH, "ge.us.txt"), engine='python')
print(df_ge.shape)
print(df_ge.columns)
display(df_ge.head(5))
tqdm_notebook.pandas(desc='Processing...')
# df_ge = process_dataframe(df_ge)
print(df_ge.dtypes)
train_cols = ["Open","High","Low","Close","Volume"]
df_train, df_test = train_test_split(df_ge, train_size=0.8, test_size=0.2, shuffle=False)
print("Train--Test size", len(df_train), len(df_test))

# scale the feature MinMax, build array
x = df_train.loc[:,train_cols].values
min_max_scaler = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
x_test = min_max_scaler.transform(df_test.loc[:,train_cols])

print("Deleting unused dataframes of total size(KB)",(sys.getsizeof(df_ge)+sys.getsizeof(df_train)+sys.getsizeof(df_test))//1024)

del df_ge
Example #13
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
# plt.style.use("fivethirtyeight")
plt.style.use('ggplot')  # pick a plotting style, just so the figures look a bit nicer
import seaborn as sns  # plotting package similar to matplotlib
import gc  # gc.collect() explicitly reclaims memory, see ②

sns.set(style="ticks", color_codes=True)  # set the plotting space to Seaborn's default style
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook as tqdm  # ③

tqdm.pandas()  # ③
import datetime
# about the plotly library ④
#import plotly.offline as ply
import ply
#ply.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings

warnings.filterwarnings('ignore')  # suppress warnings

#functions

#pictures
# Read in the dataframes
import pandas as pd
import gc
Example #14
# libs for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import plotly.offline as py
import plotly.graph_objs as go
sns.set_style('whitegrid')

# libraries
import logging.config
from pymongo import MongoClient
from requests import Session  # assumed source of the Session used in tgn() below; not in the original imports

import tqdm
from tqdm._tqdm_notebook import tqdm_notebook as tn
tn.pandas()

mc = MongoClient('mongodb://*****:*****@IP')


def tgn(msg: str, alarmer_keys=None):
    s = Session()
    if alarmer_keys is None:
        alarmer_keys = ['YOUR_TOKEN_FROM_t.me/alarmer_bot']
    parts = msg.split('\n\n')
    for ak in alarmer_keys:
        for part in parts:
            s.get('https://alarmerbot.ru/', params={'key': ak, 'message': part})
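
# Hedged usage example: the alarmer.bot token is a placeholder, and Session is
# assumed to be requests.Session (see the import note above).
tgn('training finished\n\nvalidation AUC: 0.93',
    alarmer_keys=['YOUR_TOKEN_FROM_t.me/alarmer_bot'])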


def set_logging(level="DEBUG", formatting=None, disable_existing=False, console=True, file=False, path=None):
Example #15
import re
import pandas as pd
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc="hoge progress: ")

def delete_brackets(s):
    """
    Delete brackets and the strings inside them
    """
    """ brackets to zenkaku """
    table = {
        "(": "（",
        ")": "）",
        "<": "＜",
        ">": "＞",
        "{": "｛",
        "}": "｝",
        "[": "［",
        "]": "］"
    }
    for key in table.keys():
        s = s.replace(key, table[key])
    """ delete zenkaku_brackets """
    l = ['（[^（|^）]*）', '【[^【|^】]*】', '＜[^＜|^＞]*＞', '［[^［|^］]*］',
         '「[^「|^」]*」', '｛[^｛|^｝]*｝', '〔[^〔|^〕]*〕', '〈[^〈|^〉]*〉']
    for l_ in l:
        s = re.sub(l_, "", s)
    """ recursive processing """