def get_occurrences(url: hug.types.text, key_word: hug.types.text, case_sensitive=True):
    """
    Return a JSON string with the number of occurrences of `key_word` in the
    visible text of the website given by `url`.

    Args:
        url: address of the page to fetch.
        key_word: word to count (matched against whole words only).
        case_sensitive: when False, both the page text and the keyword are
            lowercased before comparing (True by default).

    Returns:
        A JSON-formatted string of the form '{"Occurrences": "<count>"}'.
    """
    # Use response.text so requests decodes with the charset the server
    # declares; a hard-coded .decode('utf-8') raises UnicodeDecodeError on
    # legacy-encoded pages.
    response = requests.get(url)
    html_text = response.text

    # Strip Javascript, CSS and markup, keeping only the rendered text.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_text, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()

    # Normalise both sides for a case-insensitive comparison.
    if not case_sensitive:
        text = text.lower()
        key_word = key_word.lower()

    # Split into words, dropping punctuation. \W already matches ',', ';'
    # and ':', so the old alternation ",|;|:|\W" was redundant.
    words = re.split(r"\W", text)

    counter = sum(1 for word in words if word == key_word)
    return ('{{"{0}": "{1}"}}'.format("Occurrences", counter))
def suggestion(text: hug.types.text):
    """
    For a given text, return the possible terms from the suggest list of the
    elasticsearch index.

    example query: http://localhost:8888/api/suggestion?text=department
    """
    body = {
        "suggest": {
            "field-suggest": {
                "prefix": text,
                "completion": {
                    "field": "suggest"
                }
            }
        }
    }
    results = es.search(index='penn-events', body=body)

    # Collect every full term from the suggest list that contains the query
    # (case-insensitively), then deduplicate while preserving order.
    needle = text.lower()
    matches = [
        term
        for option in results['suggest']['field-suggest'][0]['options']
        for term in option['_source']['suggest']
        if needle in term.lower()
    ]
    return list(pd.unique(matches))
def _add_one_user(db: directives.PeeweeSession, username: hug.types.text, password: hug.types.text = None,
                  role: hug.types.one_of(UserRoles.user_roles()) = UserRoles.USER,
                  coupons: hug.types.number = 10):
    """
    Create a single user account inside one transaction and return its
    normalised name together with the (possibly generated) plain password.
    """
    with db.atomic():
        login = username.lower()
        pw_salt = get_random_string(2)
        # Fall back to a random 12-char password when none was supplied.
        plain_pw = password or get_random_string(12)
        new_user = User.create(
            user_name=login,
            role=role,
            salt=pw_salt,
            password=hash_pw(login, pw_salt, plain_pw),
            coupons=coupons,
        )
        new_user.save()
        return {"name": new_user.user_name, "password": plain_pw}
def get_sim_score(seq1: hug.types.text, seq2: hug.types.text, method: hug.types.text = 'levenshtein', response=None):
    """
    Compare Similarity between sequences

    Args:
        Two Sequences for which similarity score needs to be calculated,
        and the method used to calculate the score. Available methods are
        levenshtein (default), jaccard, jaro-winkler, hamming and
        sequencer-matcher

    Returns:
        Scaled Score between 0.0 and 1.0 with
        0.0 - Sequences are not similar at all
        1.0 - Sequences are completely similar
        (case-insensitive)
    """
    logger.info("Method: %s", method)
    logger.info('Seq1: %s Seq2: %s', seq1, seq2)
    simscore = SimilarityMetric([seq1, seq2])

    # Dispatch table replaces five copy-pasted if/elif branches that all
    # computed `round(similarity, 4)` identically.
    scorers = {
        'levenshtein': simscore.levenshtein,
        'jaccard': simscore.jaccard,
        'jaro-winkler': simscore.jaro_winkler,
        'hamming': simscore.hamming,
        'sequencer-matcher': simscore.sequencer_matcher,
    }
    scorer = scorers.get(method.lower())
    if scorer is None:
        response.status = falcon.HTTP_400
        return {
            'error':
            'Unsupported method. Supported method types are Levenshtein, Jaccard, Jaro-Winkler, Hamming, Sequence-Matcher'
        }
    return {'sim_score': round(scorer(), 4)}
def change_user_pw(db: directives.PeeweeSession, username: hug.types.text, password: hug.types.text,
                   for_real: hug.types.smart_boolean = False):
    """
    Change `username`'s password to `password`. Performs a dry run (prints
    what would happen and exits) unless `for_real` is set.
    """
    if not for_real:
        print(
            f"this would change {username}'s pw to {password}. Run with --for_real if you're sure.")
        sys.exit(1)
    with db.atomic():
        name = username.lower()
        salt = get_random_string(2)
        hashed_password = hash_pw(name, salt, password)
        # BUGFIX: look the user up by the normalised name. Accounts are
        # created with the lowercased name and the hash above is computed
        # over it, so querying with the raw mixed-case `username` would
        # either miss the row or store a hash that can never match.
        user = User.get(User.user_name == name)
        user.salt = salt
        user.password = hashed_password
        user.save()
        print(f"{user.user_name}'s pw successfully changed.")
def put_user(db: PeeweeSession, newUserName: hug.types.text, newUserPassword: hug.types.text,
             newUserPasswordConfirm: hug.types.text):
    """
    Register a new user with the default role and 10 coupons.

    Raises HTTPBadRequest when the two password fields disagree and
    HTTPConflict when the (lowercased) user name is already taken.
    """
    if newUserPassword != newUserPasswordConfirm:
        raise hug.HTTPBadRequest
    with db.atomic():
        try:
            login = newUserName.lower()
            pw_salt = get_random_string(2)
            hashed = hash_pw(login, pw_salt, newUserPassword)
            created = User.create(
                user_name=login,
                role=UserRoles.USER,
                salt=pw_salt,
                password=hashed,
                coupons=10,
            )
            created.save()
            return {"username": created.user_name}
        except IntegrityError:
            raise hug.HTTPConflict('User already exists.')
def search(conn: directive.connection, tables: directive.tables,
           locale: directive.locale, query: hug.types.text,
           limit: hug.types.in_range(1, 100) = 20,
           page: hug.types.in_range(1, 10) = 1):
    """ Search a route by name.

        `query` contains the string to search for. _limit_ ist the maximum
        number of results to return. _page_ the batch number of results to
        return, i.e. the requests returns results
        `[(page - 1) * limit, page * limit[`.
    """
    # Upper bound of rows to collect across all pages up to the requested one;
    # leading pages are dropped again at the end.
    maxresults = page * limit

    res = RouteList(query=query, page=page)

    r = tables.routes.data
    base = sa.select(RouteItem.make_selectables(r))

    # First try: exact match of ref
    # (+1 row so a caller could detect that more results exist)
    sql = base.where(sa.func.lower(r.c.ref) == query.lower()).limit(maxresults + 1)
    res.set_items(conn.execute(sql), locale)

    # If that did not work and the search term is a number, maybe a relation
    # number?
    if len(res) == 0 and len(query) > 3 and query.isdigit():
        sql = base.where(r.c.id == int(query))
        res.set_items(conn.execute(sql), locale)

        if len(res) > 0:
            return res

    # Second try: fuzzy matching of text
    if len(res) <= maxresults:
        remain = maxresults - len(res)
        # Preselect matches by doing a word match on name and intnames.
        # NOTE(review): '<->>>' / '<->' look like pg_trgm word-similarity
        # distance operators (smaller = closer) — confirm against the DB setup.
        primary_sim = r.c.name + sa.func.jsonb_path_query_array(r.c.intnames, '$.*', type_=sa.Text)
        primary_sim = primary_sim.op('<->>>', return_type=sa.Float)(query)
        primary_sim = primary_sim.label('sim')

        # Rerank by full match against main name
        second_sim = r.c.name.op('<->', return_type=sa.Float)(query)
        second_sim = second_sim.label('secsim')

        # Preselection: order by the word-match distance only, over-fetching
        # (10x, capped at 1100) so the rerank below has candidates to work on.
        inner = base.add_columns(primary_sim, second_sim)\
                    .order_by(primary_sim)\
                    .limit(min(1100, remain * 10))\
                    .alias('inner')

        # Final ranking: combine word-match and full-name distances.
        rematch_sim = (inner.c.sim + inner.c.secsim).label('finsim')

        sql = sa.select(inner.c)\
              .add_columns(rematch_sim)\
              .order_by(rematch_sim)\
              .limit(remain)

        # Cut off once results get much worse (distance > best + 0.3).
        minsim = None
        for o in conn.execute(sql):
            if minsim is None:
                minsim = o['finsim']
            elif o['finsim'] - 0.3 > minsim:
                break
            res.add_item(o, locale)

    # Everything up to maxresults was collected; keep only the requested page.
    if page > 1:
        res.drop_leading_results((page - 1) * limit)

    return res
def synthesize_data(query: hug.types.text, method: hug.types.text):
    """Run `query` through the data-synthesis engine and return the result.

    Any ORDER BY clause is rewritten to `random()` before execution (ordering
    is meaningless during synthesis) and, if clauses were present, re-applied
    afterwards on the synthesized DataFrame. Returns a dict with 'message',
    'query' and, on success, the executed query, an HTML table and a CSV dump.
    """
    if query_ok(query):
        # Tokenize the statement to locate ORDER BY clauses and a LIMIT.
        parsed = sqlparse.parse(query)[0]
        order_found = False
        order_clauses = []
        limit_found = False
        if parsed.get_type() == 'SELECT':
            for t in parsed.tokens:
                if(t.is_whitespace):
                    continue
                if (t.is_keyword and t.normalized == 'ORDER'):
                    order_found = True
                    continue
                if order_found:
                    # After ORDER, collect identifier (list) tokens until a
                    # keyword other than BY ends the clause.
                    if t.is_keyword and t.normalized != 'BY':
                        break
                    elif isinstance(t, (sqlparse.sql.Identifier, sqlparse.sql.IdentifierList)):
                        order_clauses.append(str(t))
            for t in parsed.tokens:
                if (t.is_keyword and t.normalized == 'LIMIT'):
                    limit_found = True

        # replace order by clauses with random()
        # as order by doesn't do anything once synthesis occurs
        fixed_query = query
        if order_found:
            # Replace the first clause in place, drop any further ones.
            i = query.rfind(order_clauses[0])
            fixed_query = fixed_query[:i] + "random()" + fixed_query[i + len(order_clauses[0]):]
            for o in order_clauses[1:]:
                i = fixed_query.rfind(o)
                fixed_query = fixed_query[:i] + fixed_query[i + len(o):]
            # some cleanup: stray commas left by the clause removal
            fixed_query = re.sub('random\(\),', 'random()', fixed_query, flags=re.M)
            fixed_query = re.sub('^\s+,', '', fixed_query, flags=re.M)
        # if no order by statement present, add it (before LIMIT if there is one)
        else:
            if limit_found:
                i = fixed_query.lower().rfind('limit')
                fixed_query = fixed_query[:i] + "\norder by random()\n" + fixed_query[i:]
            else:
                fixed_query += '\norder by random()'

        try:
            # Select the synthesis plugin matching `method`, if any.
            if method is not None:
                for m in kfpd.synthesis_methods:
                    if method.lower() == m.lower():
                        kfpd.plugin = globals()[m + 'Plugin']()
            df = kfpd.read_sql(fixed_query, db_conn)

            # if any order by clauses were present, re-apply them
            if len(order_clauses) > 0:
                sort_by = []
                asc_flags = []
                # Work on lowercased column names; restore originals after.
                orig_columns = df.columns
                df.columns = df.columns.str.lower()
                for o in order_clauses:
                    sub_o = o.split(',')
                    # If you don't specify the ASC or DESC keyword, SQLite
                    # uses ASC or ascending order by default.
                    for s in sub_o:
                        if s.lower().find(' desc') != -1:
                            asc_flags.append(False)
                        else:
                            asc_flags.append(True)
                        sort_by.append(re.sub('\s+asc|\s+desc', '', s, flags=re.IGNORECASE).strip().lower())
                df.sort_values(sort_by, ascending=asc_flags, inplace=True)
                df.columns = orig_columns

            df_html = (
                df.style
                .hide_index()
                .set_table_attributes("class='table table-hover'")
                .set_uuid('_')
                .render()
            )
            # pandas generated html has a lot of stuff we don't want returned
            # chuck it!
            df_html = re.sub(' id="T__row\d+_col\d+"', '', df_html)
            df_html = re.sub(' class="data row\d+ col\d+" ', '', df_html)
            return {
                'message': 'success',
                'query': '{0}'.format(query),
                'executed_query': fixed_query,
                'response': df_html,
                'csv': df.to_csv(index=False)}
        except Exception as e:
            print('web-service.synthesize_data() caught exception', str(e))
            return {
                'message': 'error',
                'query': '{0}'.format(query),
                'response': str(e)}
    else:
        return {
            'message': 'error',
            'query': '{0}'.format(query),
            'response': 'Invalid query provided.'}
def ip_bulk_by_category(category: hug.types.text):
    """Retrieve all IP addresses that are in feeds by feed category"""
    # Normalise to lowercase before the DB lookup.
    return FeedsAlchemy.db_ip_bulk_by_category(category.lower())
def maintainers_by_category(category: hug.types.text):
    """Retrieve all maintainers by category"""
    # Normalise to lowercase before the DB lookup.
    return FeedsAlchemy.db_maintainers_by_category(category.lower())
def maintainer_info(maintainer: hug.types.text):
    """Retrieve all available information about the maintainer by its name"""
    # Normalise to lowercase before the DB lookup.
    return FeedsAlchemy.db_maintainer_info(maintainer.lower())
def feed_info(feed_name: hug.types.text):
    """Retrieve all available information about the feed by its name"""
    # Normalise to lowercase before the DB lookup.
    return FeedsAlchemy.db_feed_info(feed_name.lower())