Example #1
class DataHandler:
    # class-level defaults; concrete handlers override these as needed
    TOP_TERMS_PER_CLUSTER = config.get_env("DEFAULT_TOP_TERMS_PER_CLUSTER")

    HAS_MULTIPLE_DATA_SOURCES = False
    DATA_SOURCE = None

    SHUFFLE_DATA = True
    PRE_LOAD_UUID = None

    def __init__(self, name):
        self.name = name
        log.info(f'{name} Data Loaded')
        self.df = None
        self.saved_item_to_cluster = None

    def display_labels(self):
        # hook for subclasses; the base handler has no labels to display
        pass

    def meta_info(self):
        # one empty metadata entry per row of the backing DataFrame
        return [{"content": ''}] * self.df.shape[0]

    def item_to_cluster(self):
        return self.saved_item_to_cluster

    def clean_up_df_text(self,
                         col,
                         language="english",
                         clean_up_method="nltk"):
        return clean_up_text(self.df, col, language, clean_up_method)

    def calculate_n_clusters(self):
        # derive the cluster count from the row count via the 'medium' bucket
        return calculate_n_clusters_by_category(self.df.shape[0])['medium'][1]
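For context, a minimal sketch of how a concrete handler might plug into this base class. The CsvDataHandler name and the pandas CSV source are assumptions, not taken from the source:

import pandas as pd

class CsvDataHandler(DataHandler):
    DATA_SOURCE = 'csv'  # hypothetical override of the class-level default

    def __init__(self, name, path):
        super().__init__(name)
        # the base class leaves self.df as None; a subclass fills it in
        self.df = pd.read_csv(path)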
Example #2
def clean_up_text(df, column_name, language, clean_up_method):
    log.info('Starting Text Cleanup')
    should_use_single_processing = config.get_env("PROCESSES_NUMBER") < 2
    log.info(f'Using {clean_up_method}. Language={language}')

    if clean_up_method == "nltk":
        class_to_use = NltkTextCleaner(language)
    elif clean_up_method == "spacy":
        if language not in ['english', 'german']:
            log.warning(f'SpaCy does not support {language}')
            return  # unsupported language: bail out (caller receives None)
        class_to_use = SpacyTextCleaner(language)
    else:
        log.warning(f'{clean_up_method} not found')
        return  # unknown cleanup method: bail out (caller receives None)

    if should_use_single_processing:
        return df[column_name].apply(class_to_use.tokenizer).tolist()
    start_time = time.time()
    with mp.Pool() as pool:
        result = pool.map(class_to_use.tokenizer, df[column_name])
    log.info(f'Finished Text Clean up after {time.time() - start_time} seconds')
    return result
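A minimal usage sketch, assuming a pandas DataFrame with a 'text' column (both the frame and the column name are illustrative placeholders):

# returns one token list per row, or None for an unknown method/language
tokens = clean_up_text(df, 'text', 'english', 'nltk')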
Example #3
    def calculate(self, stopwords=None):
        # documents are pre-tokenized, so a pass-through tokenizer is used
        # and lowercasing is disabled
        self.vectorizer = TfidfVectorizer(stop_words=stopwords,
                                          max_df=0.8,
                                          tokenizer=identity_func(stopwords),
                                          lowercase=False)
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

        # note: KMeans only accepts n_jobs in older scikit-learn releases;
        # the parameter was removed in scikit-learn 1.0
        self.model = KMeans(n_clusters=self.n_clusters,
                            init='k-means++',
                            max_iter=self.max_iteration,
                            n_init=1,
                            n_jobs=config.get_env("PROCESSES_NUMBER"))
        self.model.fit(self.tfidf_matrix)
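Both this example and Example #4 pass identity_func(...) as the tokenizer, which suggests the documents arrive already tokenized. A sketch of what such a helper might look like; the body is an assumption, only the name appears in the source:

def identity_func(_stopwords):
    # assumed pass-through: hand the pre-tokenized document back unchanged
    return lambda tokens: tokens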
Example #4
    def __init__(self, documents, n_clusters, top_terms_per_cluster):
        self.documents = documents
        self.top_terms_per_cluster = top_terms_per_cluster
        # CountVectorizer: LDA expects raw term counts, not tf-idf weights
        self.vectorizer = CountVectorizer(max_df=0.8,
                                          min_df=2,
                                          stop_words=None,
                                          tokenizer=identity_func(None),
                                          lowercase=False)
        self.tfidf = self.vectorizer.fit_transform(documents)
        # get_feature_names() was renamed get_feature_names_out() in
        # scikit-learn 1.0 and removed in 1.2
        self.features = self.vectorizer.get_feature_names()
        self.clf = LatentDirichletAllocation(
            n_components=n_clusters,
            n_jobs=config.get_env("PROCESSES_NUMBER"),
            random_state=0).fit(self.tfidf)
        self.topics()
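The topics() method is not shown; a hedged sketch of how the top terms per topic could be read off the fitted model (the cluster_terms attribute name is an assumption):

    def topics(self):
        # for each LDA topic, keep the highest-weighted feature names
        self.cluster_terms = [
            [self.features[i]
             for i in component.argsort()[-self.top_terms_per_cluster:][::-1]]
            for component in self.clf.components_
        ]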
Example #5
def configure_app(app):
    # layer class-based settings first, then file-based overrides on top
    app.config.from_object(get_config())
    app.config.from_pyfile(get_env())
    return app
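A minimal sketch of calling this from an app factory (create_app is an assumption; get_config and get_env are the project's own helpers):

from flask import Flask

def create_app():
    app = Flask(__name__)  # hypothetical factory wrapping configure_app
    return configure_app(app)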
Example #6
             "origins": [
                 "http://localhost:8080", "https://pandermatt.ch",
                 "https://kenspace.ch"
             ]
         },
     })
authorizations = {
    'Bearer Auth': {
        'type': 'apiKey',
        'in': 'header',
        'name': 'Authorization'
    },
}

# doc=False disables the Swagger UI entirely; '/' serves it at the root path
swagger_ui_enabled = '/'
if config.get_env('PRODUCTION') == 'Y':
    swagger_ui_enabled = False

api = Api(app,
          version='0.1.0',
          title='KenSpace API',
          description='API for KenSpace',
          security='Bearer Auth',
          authorizations=authorizations,
          doc=swagger_ui_enabled)

queries = api.namespace('queries', description='Query operations')
auth = api.namespace('auth', description='Authentication')
feedback = api.namespace('feedback', description='Submit Feedback')
upload = api.namespace('upload', description='Upload Data')
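A hedged sketch of attaching a resource to one of these namespaces, assuming flask-restx or its flask-restplus predecessor (the /status route and Status class are illustrative, not from the source):

from flask_restx import Resource  # or flask_restplus in older setups

@auth.route('/status')  # hypothetical endpoint on the auth namespace
class Status(Resource):
    def get(self):
        # trivial payload; the project's real resources are not shown here
        return {'status': 'ok'}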
Example #7
def verify_token(token):
    try:
        # AUTH_KEY holds the accepted token(s); membership checks validity
        return token in config.get_env('AUTH_KEY')
    except RuntimeError:
        # missing configuration: delegate to the project's error handler
        token_auth_error()
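If this callback is wired up with Flask-HTTPAuth (an assumption; the registration below is illustrative):

from flask_httpauth import HTTPTokenAuth

token_auth = HTTPTokenAuth(scheme='Bearer')
# register the function above as the token verification callback
token_auth.verify_token(verify_token)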