Example #1
    def __connect(self, wsURL):
        '''Connect to the websocket in a thread.'''
        self.logger.debug("Starting thread")

        ssl_defaults = ssl.get_default_verify_paths()
        sslopt_ca_certs = {'ca_certs': ssl_defaults.cafile}
        self.ws = websocket.WebSocketApp(wsURL,
                                         on_message=self.__on_message,
                                         on_close=self.__on_close,
                                         on_open=self.__on_open,
                                         on_error=self.__on_error,
                                         header=self.__get_auth())

        setup_custom_logger('websocket', log_level=settings.LOG_LEVEL)
        self.wst = threading.Thread(
            target=lambda: self.ws.run_forever(sslopt=sslopt_ca_certs))
        self.wst.daemon = True
        self.wst.start()
        self.logger.info("Started thread")

        # Wait for connect before continuing
        conn_timeout = 5
        while (not self.ws.sock or not self.ws.sock.connected
               ) and conn_timeout and not self._error:
            sleep(1)
            conn_timeout -= 1

        if not conn_timeout or self._error:
            self.logger.error("Couldn't connect to WS! Exiting.")
            self.exit()
            sys.exit(1)
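
Every example on this page calls a setup_custom_logger helper but none of them defines it. Below is a minimal sketch of what such a helper might look like, with the signature inferred from the call sites (name plus optional filename and log level); it is an assumption, not code from any of the original projects.

import logging

def setup_custom_logger(name, filename=None, log_level=logging.INFO):
    # Hypothetical implementation inferred from the call sites on this page:
    # attach one formatted stream or file handler to the named logger.
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s: %(message)s')
    handler = logging.FileHandler(filename) if filename else logging.StreamHandler()
    handler.setFormatter(formatter)
    logger = logging.getLogger(name)
    logger.setLevel(log_level)
    if not logger.handlers:
        logger.addHandler(handler)
    return logger
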
Example #2
def main(options):
    global logger
    logger = setup_custom_logger('root', filename=options.log_file)

    scraper = create_scraper_by_type(options.conference)
    papers = scraper.scrape_list_of_papers(options.conference_program_url)

    work_data = []
    for date, papers_by_date in papers.items():
        ensure_dir(os.path.join(options.destination_dir, date))
        for session, papers_by_session in papers_by_date.items():
            ensure_dir(os.path.join(options.destination_dir, date, session))
            for section, papers_by_section in papers_by_session.items():
                base_dir = os.path.join(options.destination_dir, date, session,
                                        section)
                ensure_dir(base_dir)
                if not os.listdir(base_dir):
                    logger.info(
                        'Directory is empty {}, will process it'.format(
                            base_dir))
                    work_data.append((base_dir, papers_by_section))
                else:
                    logger.info(
                        'Something is in {}, skipping it'.format(base_dir))

    # download_papers(work_data[0])

    pool = mp.Pool(8)
    for results in pool.imap_unordered(download_papers, work_data):
        for paper_name, paper_file in results:
            print(paper_name)
            print('  ' + paper_file)
    pool.close()
    pool.join()
Example #3
def download_papers(data):
    global logger
    logger = logging.getLogger('root')
    if not logger.handlers:
        logger = setup_custom_logger('root')
    base_dir, papers_by_section = data
    gd = GoogleDownloader()
    results = []
    for paper in papers_by_section:
        try:
            print(paper)
            folder = '{authors}_{paper-name}'.format(**paper)
            paper_dir = os.path.join(base_dir, folder)
            ensure_dir(paper_dir)
            possible_pdfs = gd.search_paper_file(paper['paper-name'])
            existing_hashes = [
                hashfile(open(os.path.join(paper_dir, filename), 'rb'),
                         hashlib.md5()) for filename in os.listdir(paper_dir)
            ]
            for link_text, pdf_url in possible_pdfs:
                filename = os.path.join(paper_dir, link_text + '.pdf')
                i = 1
                while os.path.exists(filename):
                    i += 1
                    filename = os.path.join(paper_dir,
                                            link_text + str(i) + '.pdf')
                print(link_text)
                logger.info(
                    'Downloading search result "{}" with URL {} to file "{}"'.
                    format(link_text, pdf_url, filename))
                response = requests.get(pdf_url, stream=True, verify=False)
                with open(filename, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                with open(filename, 'rb') as downloaded_file:
                    file_hash = hashfile(downloaded_file, hashlib.md5())
                if file_hash in existing_hashes:
                    os.unlink(filename)
                    logger.info(
                        'Hash of downloaded file is {} '
                        'and we already have this file; removing it'.format(
                            file_hash))
                else:
                    existing_hashes.append(file_hash)
                    logger.info('Hash of downloaded file is {}, '
                                'adding it to folder'.format(file_hash))
                    results.append((folder, filename))
        except Exception as e:
            logger.exception('Download of paper "{}" failed'.format(folder))
    return results
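
download_papers relies on a hashfile helper that is not shown in the snippet. A minimal sketch, under the assumption that it hashes a file object in chunks and returns the hex digest:

def hashfile(fileobj, hasher, blocksize=65536):
    # Hypothetical helper matching the calls above: feed the file to the
    # hasher in chunks and return the hexadecimal digest.
    buf = fileobj.read(blocksize)
    while buf:
        hasher.update(buf)
        buf = fileobj.read(blocksize)
    return hasher.hexdigest()
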
Example #4
import json
import re

import pandas as pd
import tqdm

from utils import log
from datasets.preprocess_utils import download_raw_and_preprocess

logger = log.setup_custom_logger(__name__)


REPLACE_TOKS = [
    ("#39;", "'"),
    ("#36;", "$"),
    (">", ">"),
    ("&lt;", "<"),
    ("\\$", "$"),
    ("quot;", "\""),
    ("\\", " "),
    ("#145;", "\""),
    ("#146;", "\""),
    ("#151;", "-")
]


def preprocess_ag_data(input_filename, output_filename,
                       include_title=True, include_author_media=True):
    """preprocess raw AG's news csv to Fibber's JSON format."""
    logger.info("Start preprocessing data, and save at %s.", output_filename)
    df = pd.read_csv(input_filename, header=None)
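
The snippet is cut off before REPLACE_TOKS is used. A minimal sketch of how the replacement table might be applied to a single text field (the helper name is hypothetical, not part of the original source):

def clean_ag_text(text):
    # Hypothetical helper: apply every (old, new) pair from REPLACE_TOKS.
    for old, new in REPLACE_TOKS:
        text = text.replace(old, new)
    return text
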
Example #5
            try:  # If it's not a class, skip it
                site_class = getattr(sys.modules[__name__], site.lower())
            except AttributeError as e:
                print("\nThere is no module named " + site + "\n")
                continue
            dl_path = os.path.expanduser(config[site]['download_path'])
            # Create dl path if not there
            try:
                os.makedirs(dl_path)
            except Exception as e:
                pass
            num_files = int(config[site]['number_of_files'])
            progress_file = config[site]['progress_file'].lower()
            threads = int(config[site]['threads'])
            log_file = os.path.join(dl_path, site + '.log')
            logger = setup_custom_logger('root', log_file)
            try:
                search = config[site]['search'].split(',')
            except KeyError as e:
                search = []
            if search:
                for term in search:
                    site_term = site + ":" + term
                    scrape[site_term] = Process(site_class, dl_path,
                                                progress_file, term, num_files,
                                                threads)
            else:
                scrape[site] = Process(site_class, dl_path, progress_file, '',
                                       num_files, threads)

    # Start site parser
Example #6
from utils.log import setup_custom_logger

logger = setup_custom_logger('root')

class TimeIntervalError(Exception):
    def __init__(self):
        logger.error('TimeIntervalError')

    def __str__(self):
        return 'TimeIntervalError: end time should be 1h after the start time'


class BinanceAPIException(Exception):

    def __init__(self, response):
        self.code = 0
        try:
            json_res = response.json()
        except ValueError:
            self.message = 'Invalid JSON error message from Binance: {}'.format(response.text)
        else:
            self.code = json_res['code']
            self.message = json_res['msg']
        logger.error(self.message)
        self.status_code = response.status_code
        self.response = response
        self.request = getattr(response, 'request', None)

    def __str__(self):  # pragma: no cover
        return 'APIError(code=%s): %s' % (self.code, self.message)
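
A minimal sketch of how BinanceAPIException might be raised from a requests-based call; the wrapper function below is hypothetical and not part of the original snippet:

import requests

def api_get(url, **params):
    # Hypothetical wrapper: raise BinanceAPIException for non-2xx responses.
    response = requests.get(url, params=params)
    if not (200 <= response.status_code < 300):
        raise BinanceAPIException(response)
    return response.json()
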
"""This metric computes the cosine similarity between two sentences. The sentence embedding is
the sum of GloVe word embeddings."""

import numpy as np
from nltk import word_tokenize

from utils import log
from metrics.metric_base import MetricBase
from utils import get_glove_emb, get_nltk_data, get_stopwords

logger = log.setup_custom_logger('glove_semantic_similairty')


def compute_emb(emb_table, tok_to_id, x):
    """Compute the sum of word embeddings for a sentence.

    Args:
        emb_table (np.array): the glove embedding table.
        tok_to_id (dict): a dict mapping strs to ints.
        x (str): text.

    Returns:
        (np.array): the sum of word embedding.
    """
    toks = word_tokenize(x)
    embs = []
    for item in toks:
        if item.lower() in tok_to_id:
            embs.append(emb_table[tok_to_id[item.lower()]])
    return np.sum(embs, axis=0)
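
The module docstring describes a cosine similarity between summed GloVe embeddings, but the snippet stops at compute_emb. A minimal sketch of that comparison, with a hypothetical helper name:

def glove_cosine_similarity(emb_table, tok_to_id, sent_a, sent_b):
    # Hypothetical helper: cosine similarity of the two summed embeddings.
    emb_a = compute_emb(emb_table, tok_to_id, sent_a)
    emb_b = compute_emb(emb_table, tok_to_id, sent_b)
    denom = np.linalg.norm(emb_a) * np.linalg.norm(emb_b)
    return float(np.dot(emb_a, emb_b) / denom) if denom else 0.0
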
Example #8
import atexit
import signal
import sys
from os.path import getmtime

import bitmex
from settings import settings
from utils import log, constants, errors, math

# Used for reloading the bot - saves modified times of key files
import os

watched_files_mtimes = [(f, getmtime(f)) for f in settings.WATCHED_FILES]

#
# Helpers
#
logger = log.setup_custom_logger('root')


class ExchangeInterface:
    def __init__(self, dry_run=False):
        self.dry_run = dry_run
        if len(sys.argv) > 1:
            self.symbol = sys.argv[1]
        else:
            self.symbol = settings.SYMBOL
        self.bitmex = bitmex.BitMEX(base_url=settings.BASE_URL,
                                    symbol=self.symbol,
                                    apiKey=settings.API_KEY,
                                    apiSecret=settings.API_SECRET,
                                    orderIDPrefix=settings.ORDERID_PREFIX,
                                    postOnly=settings.POST_ONLY,
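
The excerpt above breaks off mid-constructor, and the watched_files_mtimes list it builds is never checked in what is shown. A minimal sketch of the kind of restart check that list suggests; the function name is an assumption:

def check_file_change():
    # Hypothetical check: True if any watched file changed since startup.
    for f, mtime in watched_files_mtimes:
        if getmtime(f) > mtime:
            return True
    return False
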
Example #9
        if config[site]['enabled'].lower() == 'true':
            try:  # If it's not a class, skip it
                site_class = getattr(sys.modules[__name__], site.lower())
            except AttributeError as e:
                print("\nThere is no module named " + site + "\n")
                continue
            dl_path = os.path.expanduser(config[site]['download_path'])
            # Create dl path if not there
            try:
                os.makedirs(dl_path)
            except Exception as e:
                pass
            num_files = int(config[site]['number_of_files'])
            threads = int(config[site]['threads'])
            log_file = os.path.join(dl_path, site + '.log')
            logger = setup_custom_logger('root', log_file)
            try:
                search = config[site]['search'].split(',')
            except KeyError as e:
                search = []
            if search:
                for term in search:
                    site_term = site + ":" + term
                    scrape[site_term] = Process(site_class, dl_path, term, num_files, threads)
            else:
                scrape[site] = Process(site_class, dl_path, '', num_files, threads)

    # Start site parser
    try:
        for site in scrape:
            print("#### Scrapeing: " + site)
Example #10
        sys.exit(0)
    config.read(config_file)

    # Read scrape config file
    scrape_config_file = './configs/scrape.ini'
    if not os.path.isfile(scrape_config_file):
        print("Scrape config file not found: " + scrape_config_file)
        sys.exit(0)
    config.read(scrape_config_file)

    # Verify config
    # Check that there is a log file to write to
    log_path = utils.create_path(os.path.expanduser(config['parser']['log_path']), is_dir=True)

    # Create logger to use
    logger = setup_custom_logger('root', os.path.join(log_path, "reddit_scraper.log"))

    # Check save path
    save_path = utils.create_path(os.path.expanduser(config['parser']['save_path']), is_dir=True)

    # Just json
    is_just_json = False
    if config['parser']['just_json'].strip().lower() == 'true':
        is_just_json = True
        # make sure that we are not saving in a dir where reddit content is saved
        if os.path.isdir(os.path.join(save_path, "user")):
            print("The save directory seems to be where you save reddit content\n \
                   Please pick a location that will be just for json files.")
            sys.exit(0)
        # Create file to say this is a json only directory
        open(os.path.join(save_path, "only_json.lock"), 'a').close()
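
utils.create_path is used above to verify the log and save paths but is not shown. A minimal sketch under the assumption that it creates the directory if needed and returns the path unchanged:

import os

def create_path(path, is_dir=False):
    # Hypothetical helper: ensure the directory portion of path exists.
    directory = path if is_dir else os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    return path
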