예제 #1
0
# blog_parser.py

from util import get_log, bannerfy
from paragraph import Paragraph, Paragraphs, ParagraphsAction
from machine_html_parser import State, Attrs
from machine_html_parser import TransitionData, MachineHTMLParser

from typing import List, Iterable, Tuple, Callable, Optional
from datetime import datetime
from functools import reduce
from pathlib import Path
from logging import DEBUG

log = get_log(__file__, stderr=True)
log.setLevel(DEBUG)

# State Transition Diagram
#
# start ---> metadata <---> author
#                |    <---> date
#                |
#                ---> article <---> subtitle
#                       |
#                       ---> done

valid_transitions: Iterable[Tuple[State, State]] = set([
    ('start', 'metadata'),
    ('metadata', 'title'),
    ('title', 'metadata'),
    ('metadata', 'author_1'),
    ('author_1', 'author_2'),
예제 #2
0
import arrow
import asyncio
import logging
import traceback
import requests
import threading
from concurrent.futures import ThreadPoolExecutor
requests.packages.urllib3.disable_warnings()
dir_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(dir_root)
from exchange.exchange_trade import exchange_trade
from arbitrage.stat_arbitrage import stat_arbitrage
import conf.conf_aliyun
import conf
import util
logger = util.get_log(__name__)


# Run parameters for the statistical-arbitrage job.
# NOTE(review): `uesrid` looks like a typo for `userid`; the name is kept
# unchanged because code outside this view may reference it.
uesrid = 0
ex1_id = 'binance'
ex2_id = 'okex'
symbol = 'EOS/BTC'

# Build one trading handle per exchange and wire both into the strategy.
ex1 = exchange_trade.create(uesrid, ex1_id)
ex2 = exchange_trade.create(uesrid, ex2_id)
sa = stat_arbitrage(symbol, ex1, ex2)
sa.rebalance_set(True, 0.5)

tasks = sa.add_async_task()

# `asyncio.Task.all_tasks()` was deprecated in Python 3.7 and removed in 3.9.
# `asyncio.all_tasks()` is the replacement; it needs an explicit loop here
# because no loop is running at module level, and it only returns tasks that
# are not finished yet. Older interpreters keep using the original call.
if hasattr(asyncio, 'all_tasks'):
    pending = asyncio.all_tasks(asyncio.get_event_loop())
else:
    pending = asyncio.Task.all_tasks()
예제 #3
0
from paragraph import Paragraph, Paragraphs, ParagraphsAction
from util import get_log, word_count

import numpy as np  # type: ignore
from typing import Iterable, List, Dict
from datetime import datetime, timedelta
from logging import DEBUG

Samples = List[float]
Label = str
Stats = Dict[Label, Samples]

# logging

log = get_log(__file__, stderr=True, mode='w')  # mode 'w' to overwrite
log.setLevel(DEBUG)


def log_info(paragraph: Paragraph, length: float, time: float):
    """Write one INFO record with a paragraph's length and time.

    The record has the form ``[length:<L>, time:<T>] <filename>|<title>``,
    where ``<L>`` is right-aligned in 8 columns with no decimals and ``<T>``
    is right-aligned in 8 columns with two decimals.
    """
    source = f'{paragraph.filename}|{paragraph.paragraph_title}'
    measurements = f'[length:{length:8.0f}, time:{time:8.2f}]'
    log.info(f'{measurements} {source}')


# formatting


class StatsFmt:
    fields: List[str] = ['min', 'max', 'mean', 'std', 'sum']
    width: int = 7
    precision: int = 2
예제 #4
0
# -*- coding: utf-8 -*-
#
# PostGIS support wrapper.
#
# Author: Just van den Broecke
#
from util import get_log

log = get_log("postgis")

try:
    import psycopg2
    import psycopg2.extensions
except ImportError:
    log.error(
        "cannot find package psycopg2 for Postgres client support, please install psycopg2 first!"
    )
    # sys.exit(-1)


class PostGIS:
    def __init__(self, config):
        """Keep the given configuration on the instance.

        :param config: settings object stored unchanged as ``self.config``;
            ``connect`` looks up entries such as ``'database'`` and ``'user'``
            on it and calls ``.get`` for ``'host'`` and ``'port'``.
        """
        # Read the configuration
        self.config = config

    def connect(self):
        try:
            conn_str = "dbname=%s user=%s host=%s port=%s" % (
                self.config['database'], self.config['user'],
                self.config.get('host',
                                'localhost'), self.config.get('port', '5432'))
예제 #5
0
from paragraph_stats import ParagraphStatsCollector
from util import bannerfy, get_log, input_command, get_new_name
from middlewares import Middlewares, pa_log, pa_sanitize_ws, pa_chunk_long
from middlewares import pa_remove_empty, pa_cat_short, pa_remove_ptag
from es_middleware import ESMiddleware
from es_config import ES_CONFIG, my_analyzer, my_analysis, JsonObject

from elasticsearch import Elasticsearch  # type: ignore

from pprint import pprint, pformat
from glob import glob
from logging import DEBUG
from typing import Dict
from itertools import chain

log = get_log(__file__, stderr=True, mode='w')
log.setLevel(DEBUG)


class BlogIndexConfig(ES_CONFIG):
    index: str = 'site'

    # the only reason this is a property is that it is a bit convoluted to
    # create
    @property
    def mappings(self):
        default_prop: Dict[str, str] = {
            'type': 'text',
            'analyzer': 'my_analyzer'
        }
        properties: Iterable[str] = list(
예제 #6
0
import sys
import os
import util
import argparse
import data_manager
import website_rating_model

logger = util.get_log('website_rating')


def main():
    parser = argparse.ArgumentParser(description='Websites Dynamic Rating')
    parser.add_argument('-b',
                        '--basepath',
                        help='The base path to store the data set',
                        type=str,
                        required=False,
                        default=str(
                            os.path.join(os.path.expanduser('~'), 'dataset')),
                        dest='basepath')
    parser.add_argument('-i',
                        '--init',
                        action='store_true',
                        default=False,
                        dest='init',
                        help='initail environment')
    parser.add_argument('-u',
                        '--pages',
                        action='store_true',
                        default=False,
                        dest='pages',
예제 #7
0
import os
import random
import sqlite3
import sys
import uuid

from util import get_log

log = get_log('critical')


class DataStore(object):

    # MAINTENANCE
    def get_db_path(self):
        node_id = self.node_id
        port_int = self.port
        port_temp = str(port_int)
        port = port_temp[:-1]
        filename = f'abrim_{node_id}_{port}.sqlite'
        try:
            # noinspection PyUnresolvedReferences
            import appdirs
            udd = appdirs.user_data_dir("abrim", "abrim_node")
            db_path = os.path.join(udd, filename)
            if not os.path.exists(udd):
                os.makedirs(udd)
        except ImportError:
            try:
                db_path = f".{os.path.basename(sys.modules['__main__'].__file__)}{filename}"
            except AttributeError:
# get configuration
conf = get_config("configs.yaml")

MAX_CONTENT_LENGTH = conf["config"]["MAX_CONTENT_LENGTH"]

# implement rate limit
# Every argument is passed by keyword. Assuming `Limiter` is
# flask_limiter.Limiter (the import is not visible here): release 3.x made
# `key_func` the first positional parameter, so the former
# `Limiter(app, key_func=...)` call fails there with a TypeError, while the
# keyword form is accepted by both older and newer releases.
limiter = Limiter(key_func=get_remote_address,
                  app=app,
                  default_limits=[
                      conf["config"]["limiter"]["day"],
                      conf["config"]["limiter"]["hour"],
                      conf["config"]["limiter"]["second"],
                  ])

# get logger
logobj = get_log()


@app.route('/sorting', methods=['POST'])
@limit_content_length(MAX_CONTENT_LENGTH)
def sort_list():
    """
    This route receives a list of int numbers as data, and an order for sorting
    :return: return a sorted list of data consisting of integer numbers
    """
    if not request.json:
        logobj.error("Empty request.json")
        abort(400)

    if not all([request.json.get('data'), request.json.get('order')]):
        logobj.error("KeyError, no data and no order supplied")
예제 #9
0
import os
import pathlib
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

logger = util.get_log('website_rating_model')


class WebsiteRatingModel:
    def __init__(self, trian_container, valid_container):
        if trian_container:
            self.trian_container = trian_container
        else:
            self.trian_container = os.path.join(str(pathlib.Path.home()),
                                                'train')

        if valid_container:
            self.valid_container = valid_container
        else:
            self.valid_container = os.path.join(str(pathlib.Path.home()),
                                                'valid')
예제 #10
0
# -*- coding: utf-8 -*-
#
# PostGIS support wrapper.
#
# Author: Just van den Broecke
#
from util import get_log

log = get_log("postgis")

try:
    import psycopg2
    import psycopg2.extensions
except ImportError:
    log.error("cannot find package psycopg2 for Postgres client support, please install psycopg2 first!")
    # sys.exit(-1)


class PostGIS:
    def __init__(self, config):
        """Keep the given configuration on the instance.

        :param config: settings object stored unchanged as ``self.config``;
            ``connect`` looks up ``"database"`` and ``"user"`` on it and calls
            ``.get`` for ``"host"`` and ``"port"``.
        """
        # Read the configuration
        self.config = config

    def connect(self):
        try:
            conn_str = "dbname=%s user=%s host=%s port=%s" % (
                self.config["database"],
                self.config["user"],
                self.config.get("host", "localhost"),
                self.config.get("port", "5432"),
            )
예제 #11
0
#
# Uses Selenium to download all website pages.
#

from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os
import util
import numpy as np

logger = util.get_log('data_manager')


class DataManager:
    def __init__(self, base_path):

        if not base_path:
            base_path = str(os.path.join(os.path.expanduser('~'), 'dataset'))

        self.pages_folder_path = os.path.join(base_path, 'pages')
        self.train_path = os.path.join(base_path, 'train')
        self.valid_path = os.path.join(base_path, 'valid')
        self.test_path = os.path.join(base_path, 'test')
        self.urls_folder_path = os.path.join(base_path, 'urls')
        self.map_category_number_to_name = {}
        self.map_category_to_train_number = {}
        self.map_category_to_valid_number = {}
        self.category = 0
        self.train_number = 0
        self.valid_number = 0