Example #1
def run():
    start_year = args().start_year

    LOGGER.info("checking total page count")
    url = '%s%s' % (
        BASE_URL,
        args().title,
    )
    r = make_req(url)
    r = make_req("%s?searchform.when.from=%s-01-01&a=search" %
                 (r.url, start_year))
    bs = beautify(r.text)
    if args().page_range:
        start, end = args().page_range.split('-')
        start, end = int(start), int(end)
    else:
        start, end = find_page_count(bs)

    if all([start, end]):
        import multiprocessing
        from time import sleep
        from progressbar import ProgressBar, Bar, FormatLabel, RotatingMarker

        LOGGER.info(
            "generating urls between pages %d & %d for [%d - present]" %
            (start, end, start_year))

        urls = generate_urls(r.url, start, end)

        LOGGER.info("scraping dates from entries...")

        pool = multiprocessing.Pool(processes=8)
        results = pool.map_async(walk_page, urls, callback=squash_results)
        pool.close()

        remaining = results._number_left
        progress_bar = ProgressBar(
            widgets=[FormatLabel('-> '), RotatingMarker(), FormatLabel(' '), Bar()],
            maxval=remaining).start()

        while True:
            if results.ready():
                break
            progress_bar.update(remaining - results._number_left)
            sleep(.05)

        progress_bar.finish()
        if results.ready():
            import itertools
            from sourgraph.graphs import make_graph

            title = args().title.lower()

            sorted_result_list = sorted(list(itertools.chain(*crawl_results)))
            LOGGER.info("generating graph...")
            top_date = make_graph(sorted_result_list,
                                  title=title,
                                  start_year=args().start_year,
                                  trim=args().trim)
            LOGGER.info("graph saved")
            if top_date and args().with_news:
                from sourgraph.web.hurriyet import return_news_url
                LOGGER.info("checking news...")
                news_url = return_news_url(top_date, title)
                if news_url:
                    LOGGER.info("news url: %s" % news_url)

                    import webbrowser
                    LOGGER.info("opening url...")
                    webbrowser.open(news_url, new=2)
                else:
                    LOGGER.info("couldn't find any news for '%s'" % title)
    LOGGER.info("bye!")
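The loop above drives the bar by polling map_async's private `_number_left` counter. A minimal, self-contained sketch of the same polling pattern (the worker function and pool size are placeholders, not taken from the original project):

import multiprocessing
from time import sleep
from progressbar import ProgressBar, Bar, FormatLabel, RotatingMarker


def _work(x):  # stand-in for walk_page
    return x * x


if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=4)
    results = pool.map_async(_work, range(200))
    pool.close()

    remaining = results._number_left  # counts pending chunks, not items
    bar = ProgressBar(
        widgets=[FormatLabel('-> '), RotatingMarker(), FormatLabel(' '), Bar()],
        maxval=remaining).start()
    while not results.ready():
        bar.update(remaining - results._number_left)
        sleep(.05)
    bar.finish()
    pool.join()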
Example #2
import atexit

client = MongoClient()
db = client.dotabot
matches = db.matches

# Dataset manipulation
if isfile(TRAIN_FILE_NAME) and isfile(VALIDATION_FILE_NAME) and isfile(TEST_FILE_NAME):
    test_ds = SupervisedDataSet.loadFromFile(TEST_FILE_NAME)
    valid_ds = SupervisedDataSet.loadFromFile(VALIDATION_FILE_NAME)
    train_ds = SupervisedDataSet.loadFromFile(TRAIN_FILE_NAME)
    print "Training, validation and test datasets loaded"
else:
    ds = SupervisedDataSet(NUM_FEATURES, 1)

    widgets = [FormatLabel('Processed: %(value)d/%(max)d matches. '), ETA(), ' ', Percentage(), ' ', Bar()]
    pbar = ProgressBar(widgets = widgets, maxval = NUM_MATCHES).start()

    seen = set()
    r, d = 0, 0

    for i, record in enumerate(matches.find()):
        if record['match_id'] in seen:
            # print "Ignore redundant match {0}".format(record['match_id'])
            continue
        if not is_valid_match(record):
            # print "Ignore invalid match {0}".format(record['match_id'])
            continue
        seen.add(record['match_id'])
        y = 1.0 if record['radiant_win'] else 0.0
        if record['radiant_win']:
Example #3
import os

from progressbar import FormatLabel, Bar

CONFIG_PATH = os.path.expanduser(
    os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "gphoto_backup"))

PBAR_WIDGETS = [FormatLabel("|%(value)d/%(max)d Albums"), Bar()]


def clear_progressbar(pbar):
    pbar.fd.write('\r' + (' ' * pbar.term_width) + '\r')


def mkdir_if_needed(path):
    try:
        os.mkdir(path)
    except OSError:
        pass
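A hypothetical usage sketch for the two helpers above (the album count of 5 is invented); clear_progressbar() simply overwrites the bar's terminal line with spaces:

from progressbar import ProgressBar

pbar = ProgressBar(widgets=PBAR_WIDGETS, maxval=5).start()
for i in range(5):
    pbar.update(i + 1)
pbar.finish()
clear_progressbar(pbar)  # wipe the finished bar from the terminal line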
Example #4
                # we create a LineString for each Way, and we append it to the route ... (1)
                for node in nodes.nodes:
                    way.append((node.lon, node.lat))
                routes[routename].append(LineString(way))

for route in routes:
    # (1) ... and then we merge it into a single line here; it's important to note that
    # linemerge() returns a LineString when the lines are contiguous, otherwise a MultiLineString
    routes[route] = linemerge(routes[route])

print('%d routes found' % len(routes), flush=True)

results = []

pbar = ProgressBar(
    widgets=[FormatLabel('Routes processed: %(value)d of %(max)d - '),
             ETA()],
    maxval=len(routes)).start()

for i, route in enumerate(sorted(routes)):
    # see the note above on linemerge(); we handle the MultiLineString case by forcing
    # lines to always be a list, possibly containing a single item
    if type(routes[route]) == LineString:
        lines = [
            routes[route],
        ]
    else:
        lines = routes[route]

    gmap = gmplot.GoogleMapPlotter(center_lng=lines[0].centroid.x,
                                   center_lat=lines[0].centroid.y,
Example #5
def zonaprop(driver, parametros, log, inputfile=None, tmpdir=os.getcwd(), inputparam=None):

    def _get_element(xpath):
        try:
            return driver.find_element_by_xpath(xpath).text
        except Exception:
            return ""

    def _get_dat_from_ul(xpath):
        d = {
            "ambientes": None,
            "baños": None,
            "antiguedad": None,
            "superficie": None,
            "superficie_cubierta": None,
            "cochera": "No",
            "toilette": "No",
        }
        try:
            ul = driver.find_element_by_xpath(xpath)
            for text in [el.text for el in ul.find_elements_by_tag_name("li")]:
                if "Ambientes" in text:
                    d["ambientes"] = text.split(" ")[0]
                if "Baños" in text:
                    d["baños"] = text.split(" ")[0]
                if "Antigüedad" in text:
                    d["antiguedad"] = text.split(" ")[0]
                if "Total" in text:
                    d["superficie"] = text.split(" ")[0]
                if "Cubierta" in text:
                    d["superficie_cubierta"] = text.split(" ")[0]
                if "Cochera" in text:
                    d["cochera"] = "Si"
                if "Toilette" in text:
                    d["toilette"] = "Si"
        except Exception:
            # fall through and return the defaults so callers can still index by key
            pass
        return d

    def _get_propiedad_data(url_propiedad):

        driver.get(url_propiedad)

        precio = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, parametros["precio_xpath"]))
        )
        tipo_op = _get_element(parametros["price_type"])
        expensas = "'" + _get_element(parametros["expensas_xpath"])
        descripcion = _get_element(parametros["descripcion_xpath"])

        datos = _get_dat_from_ul(parametros["datos_ul_xpath"])
        mts_totales = datos["superficie"]
        mts_cubiertos = datos["superficie_cubierta"]
        ambientes = datos["ambientes"]
        baños = datos["baños"]
        cochera = datos["cochera"]
        toilette = datos["toilette"]
        antiguedad = datos["antiguedad"]

        direccion = _get_element(parametros["dir_xpath"])
        publicado = _get_element(parametros["publicado_xpath"])
        inmobiliaria = _get_element(parametros["inmobiliaria_xpath"])

        return (
            descripcion,
            direccion.replace('\r', '').replace('\n', '').replace('Ver en mapa', ''),
            precio.text,
            expensas,
            mts_totales,
            mts_cubiertos,
            ambientes,
            baños,
            toilette,
            cochera,
            antiguedad,
            publicado,
            inmobiliaria,
            url_propiedad,
        )

    datos = [('Detalle', 'Dirección', 'Precio', 'Expensas', 'Mts Totales', 'Mts Cubiertos', 'Ambientes', 'Baños', 'Toilette', 'Cochera',
            'Antiguedad', 'Publicado', 'Inmobiliaria', 'URL')]

    urls = list()

    if inputparam is not None:
        urls = [inputparam]
    else:
        with open(inputfile, "r") as f:
            urls = f.readlines()

    widgets = [FormatLabel(''), ' ', Percentage(), ' ', Bar('#'), ' ', ETA(), ' ', RotatingMarker()]
    bar = ProgressBar(widgets=widgets, maxval=len(urls)).start()

    i = 1
    for url_propiedad in urls:
        url_propiedad = url_propiedad.strip()

        try:
            # a = 1/0
            datos.append(_get_propiedad_data(url_propiedad))
        except Exception as err:

            vacio = list("" for _ in range(len(datos[0])))
            vacio[-1] = "!!!Error: {0}".format(err)
            datos.append(tuple(vacio))

        widgets[0] = FormatLabel('[Prop: {0}]'.format(url_propiedad))
        bar.update(i)
        i = i + 1

    bar.finish()

    driver.quit()

    return datos
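The loop above swaps in a fresh FormatLabel on every iteration so the bar shows which URL is currently being processed. A minimal sketch of that dynamic-label pattern (the item list is invented for illustration; it works because ProgressBar keeps a reference to the widgets list it was given):

from time import sleep
from progressbar import ProgressBar, FormatLabel, Percentage, Bar

items = ['uno', 'dos', 'tres', 'cuatro']
widgets = [FormatLabel(''), ' ', Percentage(), ' ', Bar('#')]
bar = ProgressBar(widgets=widgets, maxval=len(items)).start()
for i, item in enumerate(items, start=1):
    widgets[0] = FormatLabel('[Prop: {0}]'.format(item))  # replace the label in place
    bar.update(i)
    sleep(0.2)
bar.finish()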
Example #6
busroutes = sorted(glob.glob('tfl_bus_routes/*.json'))

# we load the JSONs only once
print('Caching routes JSONs...')
routes = {}
for route in busroutes:
    with open(route) as f:
        routes[route] = json.load(f)

pairs = [x for x in combinations(busroutes, 2)]

results = []

pbar = ProgressBar(
    widgets=[FormatLabel('Pairs processed: %(value)d of %(max)d - '),
             ETA()],
    maxval=len(pairs)).start()

for i, (route1, route2) in enumerate(pairs):
    r1 = routes[route1]
    r2 = routes[route2]

    # don't compare two routes of the same bus line
    if r1['lineName'] == r2['lineName']:
        continue

    # we first check the end of r1 and the start of r2
    r = three_stops_distance(DISTANCE,
                             r1['stopPointSequences'][0]['stopPoint'][-3:],
                             r2['stopPointSequences'][0]['stopPoint'][:3])
Example #7
def main():
    catalog = {}
    curr_data_date = None

    # Add some headroom to avoid overflowing the bar when new stocks are found
    total = _total_stocks() + 10

    widgets = [
        FormatLabel(
            'Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    state = common.load_state()

    for catalog_key, url in CATELOG.items():
        data_date, result = get_category_stock_info(url)
        if not result:
            raise Exception('Empty parsing result, key: {}, url: {}'.format(
                catalog_key, url))
        if curr_data_date is None:
            curr_data_date = data_date
        elif curr_data_date != data_date:
            msg = 'Data date is not the same!'\
                ' curr_data_date: %s, data_date: %s, url: %s'\
                % (curr_data_date, data_date, url)
            common.report_error(msg)
            raise Exception(msg)

        stype, category = catalog_key
        for stock_no, data in result.items():
            stock_data = common.load_stock(stock_no)
            daily_report = stock_data.setdefault(common.DAILY, {})
            meta = stock_data.setdefault(common.META, {})
            daily_report[data_date] = data
            category_key = SEPARATOR.join(catalog_key)
            meta.update({
                common.META_STOCK_NO: stock_no,
                common.META_COMPANY_TYPE: stype,
                common.META_COMPANY_CATEGORY: category,
                common.META_CATEGORY_KEY: category_key,
                common.META_NAME: data.pop('name'),
                common.META_DAYS: sorted(daily_report.keys(), reverse=True),
            })
            stock_data.setdefault(common.META, {}).update(meta)
            common.save_stock(stock_no, stock_data)
            catalog.setdefault(category_key, []).append(stock_no)
            pbar.update(count)
            count += 1

        if not catalog.setdefault(SEPARATOR.join(catalog_key), []):
            common.report_error('NO STOCK FOUND!!!! %s, %s' %
                                (catalog_key, url))
    common.save_catalog(catalog)
    state[common.CURRENT_DATA_DATE] = curr_data_date
    common.save_state(state)
    pbar.finish()
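The label used here mixes str.format() for the already-known total with %-style placeholders that FormatLabel fills in on every update. A stand-alone sketch of just that label style (the total of 25 is arbitrary):

from time import sleep
from progressbar import ProgressBar, FormatLabel

total = 25
widgets = [
    FormatLabel('Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
]
pbar = ProgressBar(widgets=widgets, maxval=total).start()
for count in range(total):
    pbar.update(count + 1)
    sleep(0.05)
pbar.finish()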
Example #8
def transfer(read_from, save_to):
    click.echo('%s --> %s' % (read_from, save_to))
    if read_from not in OPTIONS or save_to not in OPTIONS:
        print 'Should be %s or %s' % (LOCAL, FIREBASE)
        sys.exit(-1)
    if read_from == save_to:
        print 'Saving data to where it is from does not make sense.'
        sys.exit(-2)

    click.echo('This will OVERWRITE data in "%s". Are you sure? [y/N]' %
               save_to)
    confirm = sys.stdin.readline()
    if confirm.strip() != 'y':
        print 'byebye~'
        return

    common.READ_FROM = common.LOCAL if read_from == LOCAL else common.FIREBASE
    common.SAVE_TO = (common.LOCAL,)\
        if save_to == LOCAL else (common.FIREBASE,)

    print 'Transferring catalog...'
    catalog = common.load_catalog()
    common.save_catalog(catalog)

    print 'Transferring categories...'
    catalog = common.load_catalog()
    categories = common.load_categories()
    common.save_categories(categories)

    print 'Transferring filter results...'
    f_results = common.load_filter_results()
    common.save_filter_results(f_results)

    print 'Transferring indicator results...'
    i_results = common.load_indicator_results()
    common.save_indicator_results(i_results)

    print 'Transferring config...'
    config = common.load_config()
    common.save_config(config)

    todo = []
    for stocks in catalog.values():
        todo.extend(stocks)
    total = len(todo)
    print 'Transferring stocks...'
    widgets = [
        FormatLabel(
            'Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    for s in todo:
        data = common.load_stock(s)
        common.save_stock(s, data)
        pbar.update(count)
        count += 1
    pbar.finish()

    print 'Transferring state...'
    catalog = common.load_catalog()
    state = common.load_state()
    common.save_state(state)
Example #9
    perf_nans = np.isnan(perf_array)
    if (1 - perf_nans).sum() == 0:
        raise Exception('The selected metric evaluations are all nans')

    best_perf_expes = perf_array[perf_nans == False]  # NOQA
    bool_choice = op(best_perf_expes) == np.array(best_perf_expes)
    best = ar_expes[bool_choice]  # NOQA
    best_key = ar_keys[bool_choice]
    return best[0], best_key[0]


widgets = [
    Percentage(), ' ',
    SimpleProgress(), ' ',
    Bar(marker='=', left='[', right=']'), ' ',
    FormatLabel('in: %(elapsed)s'), ' ',
    ETA(), ' | ', 'job/',
    DynamicMessage('s')
]


class Ensemble(object):
    """Base class to build experiments containers able to execute batch
    sequences of action. Must implement the `fit`, `fit_gen`, `fit_async`
    `fit_gen_async` methods

    Args:
        experiments(dict or list): experiments to be wrapped. If a dictionnary
            is passed, it should map experiment names to experiments.
    """
    def __init__(self, experiments):
Example #10
def NodeDic(results, edge_info, node_info):
    '''
    Function takes the results of running a query, NETS edge label information, and a list of node information (list[0]
    contains the NETS nodes label triples, list[1] contains the NETS nodes identifier triples). The function returns a
    list of dictionaries where list[0] contains a nested dictionary where keys are bio entity identifiers and the
    values are the human readable labels and database identifiers; list[1] contains a dictionary where the bio node is
    the key and the value is a set of possible NETS node types for that node.
    :param results: json file containing the query results from endpoint
    :param edge_info: dictionary where the keys are the NETS edges and the values are the edge labels
    :param node_info: a list of node information (list[0] contains the NETS nodes label triples, list[1] contains the
    NETS nodes identifier triples)
    :return: a list of dictionaries: list[0] contains a nested dictionary where keys are bio entity identifiers and the
    values are the human readable labels and database identifiers; list[1] contains a dictionary where the bio node is
    the key and the value is a set of possible NETS node types for that node
    '''

    print 'Start building OWL-NETs metadata dictionary'

    # creates a map to store NETS node type information
    node_type = {}

    # creates a map to identify which query variables represent the BIO world ID, label, and ICE ID
    node_labeler = {}

    # assign variables needed for node dictionary
    NETS = set([x.strip('?') for y in edge_info[0].keys() for x in y])
    labels = [[re.sub('[?|"\n"]', '', x.split(' ')[0]), re.sub('[?|"\n"]', '', x.split(' ')[2])] for x in node_info[0]]
    ids = [[x.split(' ')[0].strip('?'), x.split(' ')[2].strip('?')] for x in node_info[1]]

    # initialize progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(NETS))

    for node in pbar(NETS):
        node_labeler[node] = {}

        for res in results['results']['bindings']:
            node_key = str(res[node]['value'])
            label_value = str([x[1] for x in labels if x[0] == node][0].encode('utf8'))
            id_value = str([x[0] for x in ids if x[1] == node][0].encode('utf8'))

            # NODE TYPE: setting node type information
            if node_key in node_type.keys():
                node_type[node_key].add(node)

            else:
                node_type[node_key] = set()
                node_type[node_key].add(node)

            # NODE METADATA: setting node attributes by NETS node type
            if node_key in node_labeler[node].keys():
                # order matters - not using a set so that each ICE can be mapped to the label with the same index
                node_labeler[node][node_key]['label'].append(res[label_value]['value'].encode('utf8'))
                node_labeler[node][node_key]['id'].append(res[id_value]['value'].encode('utf8'))

            else:
                node_labeler[node][node_key] = {}
                node_labeler[node][node_key]['label'] = [res[label_value]['value'].encode('utf8')]
                node_labeler[node][node_key]['id'] = [res[id_value]['value'].encode('utf8')]

    # close progress bar
    pbar.finish()
    print 'Finished building OWL-NETs metadata dictionary'
    print '\n'

    # CHECK: verify that the counts are correct
    for node in NETS:
        res_count = set()
        for res in results['results']['bindings']:
            res_count.add(res[node]['value'])

        if len(node_labeler[node].keys()) != len(res_count):  # verify the number of nodes in graph is correct
            raise ValueError('The count of results for the ' + str(node) + ' NETS node in the node dictionary differ '
                                                                           'from the query output')

    return node_labeler, node_type
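The function indexes `results['results']['bindings']` and then `res[variable]['value']`, i.e. it assumes the standard SPARQL 1.1 JSON results layout. A rough illustration of that shape (variable names and URIs are invented, and the label/identifier variables referenced via node_info are omitted):

results = {
    "results": {
        "bindings": [
            {
                # one dict per solution; each bound variable maps to a typed value
                "gene": {"type": "uri", "value": "http://example.org/gene/BRCA1"},
                "disease": {"type": "uri", "value": "http://example.org/disease/D001943"},
            },
        ]
    }
}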
Example #11
def NETSGraph(results, NETS_edges, node_labeler, node_type, edge_labeler):
    '''
    Function takes a json file of query results, a list of NETS edges, node and edge metadata dictionaries, and a
    dictionary containing NETS edge information by BIO node. Using these items the function creates the directed
    OWL-NETS abstraction network. Node metadata includes: labels (a list of human readable labels); id (the endpoint
    database identifiers); and bio (the NETS node type). Edge metadata includes: labels (human readable label for the
    edge between two NETS nodes) and id (the ontology concept term used to link the NETS nodes).
    :param results: json file containing the query results from endpoint
    :param NETS_edges: list of lists, where each list is a NETS edge and the order specifies a directional relationship
    :param node_labeler: node metadata nested lists (list[0] contains the NETS nodes label triples, list[1] contains the
    NETS nodes identifier triples)
    :param node_type: dictionary with BIO node as key and set of NETS node types as value
    :param edge_labeler: dictionary where the keys are the NETS edges and the values are the edge labels
    :return: OWL-NETS directed graph
    '''
    print 'Started building OWL-NETS graph'

    # initialize progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(results['results']['bindings']))

    NETS_graph = nx.DiGraph()

    for res in pbar(results['results']['bindings']):
        for edge in NETS_edges:

            i = res[str(edge[0].strip('?').encode('utf8'))]['value'].encode('utf8')
            j = res[str(edge[1].strip('?').encode('utf8'))]['value'].encode('utf8')

            # add first node of the edge
            NETS_graph.add_node(min(node_labeler[edge[0].strip('?')][i]['label'], key=len),
                                labels=node_labeler[edge[0].strip('?')][i]['label'],
                                id=node_labeler[edge[0].strip('?')][i]['id'],
                                bio=i,
                                type='-'.join(list(node_type[i])))

            # add second node of the edge
            NETS_graph.add_node(min(node_labeler[edge[1].strip('?')][j]['label'], key=len),
                                labels=node_labeler[edge[1].strip('?')][j]['label'],
                                id=node_labeler[edge[1].strip('?')][j]['id'],
                                bio=j,
                                type='-'.join(list(node_type[j])))
            # add edge
            NETS_graph.add_edge(min(node_labeler[edge[0].strip('?')][i]['label'], key=len),
                                min(node_labeler[edge[1].strip('?')][j]['label'], key=len),
                                labels=res[(edge_labeler[tuple(edge)]['label']).strip('?')]['value'].encode('utf8'),
                                id=(edge_labeler[tuple(edge)]['id']).strip('?'),
                                edge='-'.join([edge[0].strip('?'), edge[1].strip('?')]))


    # close progress bar
    pbar.finish()
    print 'Finished building OWL-NETS graph'
    print '\n'

    # print information about graph
    print 'Directed OWL-NETS Graph has ' + str(len(NETS_graph.nodes())) + ' nodes, ' + str(
        len(NETS_graph.edges())) + ' edges, and ' + str(
        nx.number_connected_components(NETS_graph.to_undirected())) + ' connected component(s)'

    return NETS_graph
Example #12
    g_optimizer = torch.optim.Adam(g_net.parameters(),
                                   lr=arg.lr,
                                   betas=(0.5, 0.999))
    d_optimizer = torch.optim.Adam(d_net.parameters(),
                                   lr=arg.lr,
                                   betas=(0.5, 0.999))

    log_file = open(arg.log_file, 'w')

    for epoch in range(1, arg.epochs + 1):

        print('Epoch: {}/{}'.format(epoch, arg.epochs))
        g_total_loss, d_total_loss = 0, 0

        widgets = [
            FormatLabel(''), ' ',
            Bar('=', '[', ']'), ' - ',
            ETA(), ' ',
            FormatLabel('')
        ]
        pbar = ProgressBar(widgets=widgets, maxval=x_train.shape[0])
        pbar.start()

        for i, (real_img, real_tag) in enumerate(train_loader):
            for p in d_net.parameters():
                p.requires_grad = True

            noise = Variable(torch.randn(real_img.size()[0], arg.noise_dim),
                             volatile=True).cuda()
            wrong_tag = Variable(get_wrong_tag(real_tag)).cuda()
            real_img = Variable(real_img).cuda()
Example #13
def generate_bar(ln, text):
    return pb(min_value=0, max_value=ln, widgets=[FormatLabel(text), ] + base_widgets)
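generate_bar() relies on a `pb` alias and a `base_widgets` list that are not shown in this snippet; the min_value/max_value keywords suggest the progressbar2-style ProgressBar. A self-contained sketch under assumed definitions (the import alias and base_widgets are guesses, not taken from the original project):

from progressbar import ProgressBar as pb
from progressbar import Bar, ETA, FormatLabel, Percentage

base_widgets = [' ', Percentage(), ' ', Bar(), ' ', ETA()]


def generate_bar(ln, text):
    return pb(min_value=0, max_value=ln, widgets=[FormatLabel(text), ] + base_widgets)


# usage: the returned bar can wrap an iterable directly
for _ in generate_bar(40, 'Working: ')(range(40)):
    pass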
Example #14
    def finish(self, total):
        msg = '[Patching {0} ASGs]: {0} Complete'.format(total)
        self.widgets[4] = FormatLabel(msg)
        self.progress.finish()
Example #15
        for b in meta.active_branches():
            chain.SetBranchStatus(b, 1)
        chain.SetBranchStatus('run', 1)
        chain.SetBranchStatus('lumi', 1)
        chain.SetBranchStatus('evt', 1)
        for b in args.branches:
            chain.SetBranchStatus(b, 1)
    except:
        log.warning("Couldn't get meta tree - will not disable branches")

    passed_events = []

    nrows = chain.GetEntries()
    pbar = ProgressBar(
        widgets=[FormatLabel('Processed %(value)i/' + str(nrows) + ' rows. '),
                 ETA(), Bar('>')],
        maxval=nrows).start()
    pbar.update(0)

    for row in xrange(nrows):
        pbar.update(row)
        chain.GetEntry(row)
        all_passed = True
        for name, selection in selections:
            passed = selection(chain)
            if not passed:
                all_passed = False
                break
Example #16
def process_rocstories(stories):
    """
    - randomly select one missing sentence
    - randomly select accepted words: a list of (position, word)
    - randomly select keywords (excluding accepted words): an unordered list of words
    :param stories:
    :return:
    """
    story_size = len(stories)
    ''' Get missing sent indexes '''
    missing_sent_indexes = [random.randint(1, 3) for _ in range(story_size)]
    hist, bins = np.histogram(missing_sent_indexes, bins=3)
    print("-" * 80)
    print("Histogram of missing sent indexes")
    print(" ".join(["%5d" % b for b in bins[1:]]))
    print(" ".join(["%5d" % h for h in hist]))
    print("-" * 80)

    widgets = [
        FormatLabel('Processed: %(value)d stories (in: %(elapsed)s)'),
        Percentage(), " | ",
        SimpleProgress(), " | ",
        Bar()
    ]
    pbar = ProgressBar(widgets=widgets)

    stories_processed = {l: [] for l in range(4, 14)}
    for i in pbar(range(len(stories))):
        story, missing_idx = stories[i], missing_sent_indexes[i]
        entity = ROCStoriesEntity(story, missing_idx)
        l = entity.missing_sent_len_np
        if l < 5:
            l = 4
        if l > 12:
            l = 13
        stories_processed[l].append(entity)

    a_lens = {}
    k_lens = {}
    for l, bucket in stories_processed.items():
        random.seed(l)

        a_lens[l] = []

        for entity in bucket:
            # select accepted words
            accepted_len = random.randint(0, l - KEYWORD_MIN - 1)

            # # select keywords
            # keywords_len = l+1
            # while accepted_len + keywords_len > l:
            #     keywords_len = random.randint(0, l-1)

            # a_lens[l].append(accepted_len)
            # k_lens[l].append(keywords_len)

            # assert (accepted_len + keywords_len) <= l, "Missing sent ACPT/KEY selection: Something went wrong: [accepted:%d][keywords:%d][sent_len:%d]"%(accepted_len, keywords_len, l)

            entity.accepted_words = [
                (i, tok.lower())
                for i, tok in zip(entity.missing_sent_tokens_randomized_np_idx,
                                  entity.missing_sent_tokens_randomized_np)
            ][:accepted_len] if accepted_len > 0 else []
            # entity.keywords = [tok for tok in entity.missing_sent_tokens_randomized_np[accepted_len:keywords_len]] if keywords_len > 0 else []
            a_lens[l].append(len(entity.accepted_words))

    for l, bucket in stories_processed.items():
        random.seed(l)

        k_lens[l] = []
        entities_to_remove = []

        for entity in bucket:
            # select accepted words
            # accepted_len = random.randint(0, l-1)
            accepted_len = len(entity.accepted_words)

            # select keywords
            keywords_len = l + 1
            while accepted_len + keywords_len > l:
                keywords_len = random.randint(KEYWORD_MIN, l - accepted_len)

            # A little nudge to push the distribution to the right
            if accepted_len + keywords_len < l:
                if random.random() > 0.5:
                    keywords_len += 1

            assert (
                accepted_len + keywords_len
            ) <= l, "Missing sent ACPT/KEY selection: Something went wrong: [accepted:%d][keywords:%d][sent_len:%d]" % (
                accepted_len, keywords_len, l)

            entity.keywords = [
                tok for tok in entity.missing_sent_tokens_randomized_np[
                    accepted_len:accepted_len + keywords_len]
            ] if keywords_len > 0 else []
            if len(entity.keywords) == 0:
                print("=> keywords_len: %d produced 0 keywords" %
                      (keywords_len))
                print("\t[awl: %d], src_token_len: %d (%s)" %
                      (len(entity.accepted_words),
                       len(entity.missing_sent_tokens_randomized_np),
                       entity.title))
                entities_to_remove.append(entity)
                continue

            k_lens[l].append(len(entity.keywords))

        for e in entities_to_remove:
            stories_processed[l].remove(e)
            print("Removing: %s" % (e.title))

    for l in a_lens:
        a_bucket = a_lens[l]
        k_bucket = k_lens[l]
        # print_histogram(l, a_bucket, "Accepted words")
        # print_histogram(l, k_bucket, "Keywords")
        print_histogram2(l, a_bucket, k_bucket, "Accepted words", "Keywords")

    return stories_processed
Example #17
    def update_msg(msg):
        widgets[0] = FormatLabel(f"[{i:4d}/{n_files}] {msg}")
Example #18
def main():
    """
    Entry point
    """
    working_directory = getcwd()

    parser = ArgumentParser(description='')
    parser.add_argument('-i',
                        '--input_folder',
                        dest='input',
                        metavar='INPUT_DIRECTORY',
                        required=False,
                        default=working_directory,
                        help='Source directory for file renaming. '
                        'Current directory by default')
    args = parser.parse_args()

    files = [
        join(args.input, file) for file in listdir(args.input)
        if isfile(join(args.input, file))
    ]
    images_files = [file for file in files if is_image(file)]
    video_files = [file for file in files if is_video(file)]

    total_files = len(images_files) + len(video_files)

    widgets = [
        FormatLabel('Extracting info'), ' ',
        Percentage(), ' ',
        Bar(), ' ',
        ETA()
    ]

    progress_bar = ProgressBar(maxval=total_files,
                               redirect_stdout=True,
                               widgets=widgets)
    progress_bar.start()

    images_info_map = {}
    file_counter = 0

    for file in images_files:
        file_counter += 1
        progress_bar.update(file_counter)
        images_info_map[file] = exif_time_else_creation_time(file)

    video_files_map = {}

    for file in video_files:
        file_counter += 1
        progress_bar.update(file_counter)
        video_files_map[file] = creation_time(file)

    progress_bar.finish()

    image_renamings = calculate_renamings(images_info_map)
    video_renamings = calculate_renamings(video_files_map)

    image_renamings = dump_renamings(image_renamings)
    video_renamings = dump_renamings(video_renamings)

    if ask_yesno(msg='Confirm renaming', dft='y'):
        rename_files(image_renamings, label='Renaming image files')
        rename_files(video_renamings, label='Renaming video files')
Example #19
File: kfcv_d.py  Project: yulkins/dotaml
preprocessed = np.load('train_51022.npz')
X = preprocessed['X']
Y = preprocessed['Y']

NUM_MATCHES = 20000
X = X[0:NUM_MATCHES]
Y = Y[0:NUM_MATCHES]

print 'Training using data from %d matches...' % NUM_MATCHES

k_fold = cross_validation.KFold(n=NUM_MATCHES, n_folds=K, indices=True)

d_tries = [3, 4, 5]

widgets = [
    FormatLabel('Processed: %(value)d/%(max)d folds. '),
    ETA(),
    Percentage(), ' ',
    Bar()
]
pbar = ProgressBar(widgets=widgets, maxval=(len(d_tries) * K)).start()

d_accuracy_pairs = []
for d_index, d in enumerate(d_tries):
    model = KNeighborsClassifier(n_neighbors=NUM_MATCHES / K,
                                 metric=my_distance,
                                 weights=poly_param(d))
    model_accuracies = cross_validation.cross_val_score(model,
                                                        X,
                                                        Y,
                                                        scoring=score,
Example #20
class ScrapeJam:  # Here's your chance, do your dance, at the ScrapeJam

    widgets = [
        Percentage(),
        Bar(),
        FormatLabel(' %(value)d/%(max)d '),
        ETA(),
        FormatLabel(' (%(elapsed)s)')
    ]

    def __init__(self, filepath, errorlog=None):
        self.file = filepath
        self.log = errorlog
        self.win = curses.initscr()
        curses.start_color()
        curses.curs_set(0)
        curses.noecho()
        curses.cbreak()
        self.refresh()

    def __del__(self):
        pass

    def write(self, file, data):
        f = open(file, 'w')
        json.dump(data, f, encoding='utf-8')
        f.close()

    def refresh(self, clear=True):
        if clear:
            self.win.clrtobot()
        self.win.refresh()

    def move(self, y, x):
        self.win.move(y, x)
        self.refresh(False)

    # TODO: fill in UUIDs
    def scrape(self, artists, album_fn, song_fn, lyric_fn, errorlog=None):
        """Fetches rows from a Bigtable.

		Args:
			artists		List of tuples (artist_name, artist_url)
			album_fn	Func(artist_tuple): returns list of tuples (album_name, album_url)
			song_fn		Func(artist_tuple, album_tuple): returns list of tuples (song_name, song_url)
			lyric_fn	Func(artist_tuple, album_tuple, song_tuple): returns lyrics or None
		"""
        def errorwrap(fn):
            def wrapped(*args, **kwargs):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    error(e)
                    return [
                    ]  # In the case that error() just logged and continued

            return wrapped

        def error(e):
            if self.log:
                # Log the error and continue on your merry way (or explode w/o log file)
                self.errorlist.append(
                    {artist[0]: [song[1], traceback.format_exc()]})
            else:
                raise e

        album_fn = errorwrap(album_fn)
        song_fn = errorwrap(song_fn)
        lyric_fn = errorwrap(lyric_fn)
        i = [0, 0, 0]  # Incrementors for artists, albums, and songs
        done = [0, 0, 0]  # Counter for completed scrapes
        data = {}
        self.errorlist = []
        try:
            self.artists_pbar = ProgressBar(widgets=[' Artists:'] +
                                            self.widgets,
                                            maxval=len(artists)).start()
            for artist in artists:
                albums = album_fn(artist)
                if len(albums) == 0: continue
                i[SJ_ALBUM] = 0
                data[artist[0]] = {
                    'albums': {},
                    'uuid': None,
                    'url': artist[1]
                }
                self.albums_pbar = ProgressBar(widgets=[' Albums: '] +
                                               self.widgets,
                                               maxval=len(albums)).start()
                for album in albums:
                    songs = song_fn(artist, album)
                    if len(songs) == 0: continue
                    i[SJ_SONG] = 0
                    data[artist[0]]['albums'][album[0]] = {
                        'songs': {},
                        'uuid': None,
                        'url': album[1]
                    }
                    self.songs_pbar = ProgressBar(widgets=[' Songs:  '] +
                                                  self.widgets,
                                                  maxval=len(songs)).start()
                    for song in songs:
                        lyrics = lyric_fn(artist, album, song)
                        if not lyrics: continue
                        data[artist[0]]['albums'][album[0]]['songs'][
                            song[0]] = {
                                'lyrics': lyrics,
                                'uuid': None,
                                'url': song[1]
                            }
                        i[SJ_SONG] += 1
                        done[SJ_SONG] += 1
                        self.drawProgress((artist[0], album[0], song[0]), i,
                                          done)
                    i[SJ_ALBUM] += 1  # Completed an album
                    done[SJ_ALBUM] += 1
                i[SJ_ARTIST] += 1  # Completed an artist
                done[SJ_ARTIST] += 1
                htmlCache = {}  # Reset HTML cache after each artist
        except Exception:
            curses.nocbreak()
            curses.echo()
            curses.endwin()
            traceback.print_exc()
            print "Ended on (%s) (%s) (%s)" % (artist[0], album[0], song[0])
        except KeyboardInterrupt:
            pass
        finally:  # Must be run to restore terminal's state to normal
            curses.nocbreak()
            curses.echo()
            curses.endwin()
            self.write(self.file, data)
            if self.log and len(self.errorlist) != 0:
                self.write(self.log, self.errorlist)

    def drawProgress(self, names, values, done):
        self.move(0, 0)
        self.artists_pbar.update(values[0])
        self.move(1, 0)
        self.albums_pbar.update(values[1])
        self.move(2, 0)
        self.songs_pbar.update(values[2])
        self.win.addstr(4, 0, " Artist: " + names[0].encode('utf8'))
        self.refresh()
        self.win.addstr(
            5, 0,
            " Album:  " + (names[1] if names[1] else "N/A").encode('utf8'))
        self.refresh()
        self.win.addstr(6, 0, " Song:   " + names[2].encode('utf8'))
        self.refresh()
        self.win.addstr(8, 0, " COMPLETED")
        self.win.addstr(9, 0,
                        " Artists: %d  Albums: %d  Songs: %d" % tuple(done))
        if self.log:
            self.win.addstr(10, 0, " Fatal errors: %d" % len(self.errorlist))
        self.refresh()
        self.move(2, 0)
Example #21
NUM_HEROES = 108
NUM_FEATURES = NUM_HEROES * 2

# Our training label vector, Y, indicates
# whether radiant won (1) or lost (-1)
NUM_MATCHES = matches.count()

# Initialize training matrix
X = np.zeros((NUM_MATCHES, NUM_FEATURES), dtype=np.int8)

# Initialize training label vector
Y = np.zeros(NUM_MATCHES, dtype=np.int8)

widgets = [
    FormatLabel('Processed: %(value)d/%(max)d matches. '),
    ETA(),
    Percentage(), ' ',
    Bar()
]
pbar = ProgressBar(widgets=widgets, maxval=NUM_MATCHES).start()

for i, record in enumerate(matches.find()):
    pbar.update(i)
    Y[i] = 1 if record['radiant_win'] else -1
    players = record['players']
    for player in players:
        hero_id = player['hero_id'] - 1

        # If the left-most bit of player_slot is set,
        # this player is on dire, so push the index accordingly
Example #22
    # We do this here to prevent ROOT from messing with sys.argv
    import ROOT

    if not os.path.exists(args.outputdir):
        os.makedirs(args.outputdir)

    log.info("Finding input files for job: %s in %s"
             % (args.jobid, args.directory))

    for sample_name, search_dir, all_files in find_sample_dirs(
            args.directory.split(':'), args.jobid):
        output_txt = os.path.join(args.outputdir, sample_name + '.txt')
        previous_files = get_previous_files(output_txt)
        with open_update_if_changed(output_txt, sample_name) as flist:
            pbar = ProgressBar(widgets=[FormatLabel(
                'Checked %(value)i/' + str(len(all_files)) + ' files. '),
                ETA(), Bar('>')], maxval=len(all_files)).start()

            for i, file in enumerate(all_files):
                pbar.update(i)
                filepath = file
                if args.relative:
                    filepath = os.path.relpath(file, search_dir)
                # Always write if we have found + checked it OK before
                if not args.nocheck and (args.force
                                         or file not in previous_files):
                    tfile = ROOT.TFile.Open(file)
                    if not tfile:
                        log.warning("-- Can't open file: %s" % file)
                        flist.write('# corrupt %s\n' % filepath)
                        continue
Example #23
def parse_mails(indir, outdir):
    global firstnames
    global lastnames
    firstnames = open(FIRSTNAMES_FILE, 'r').read().split('\n')
    lastnames = open(LASTNAMES_FILE, 'r').read().split('\n')
    all_references = {}
    all_other = []

    print()
    print('****************** Mail Converter for mbox-Format Emails ************************')
    print('     V1.0 - 2017-08-01, Copyright (c) 2017 MUNICH AILABS GmbH')
    print('                   Written: 2017-08-01 ... 15, Imdat Solak')
    print('                            All rights reserved.')
    print('----------------------------------------------------------------------------------')
    print()
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    # First collect all mails.
    # Some may contain references, others may not...
    print('Scanning directory [%s]... ' % indir, end='')
    filenames = []
    sys.stdout.flush()
    for root, dirs, files in os.walk(indir):
        for filename in filter(lambda filename: filename.endswith('.eml'), files):
            if not filename.startswith('._'):
                filenames.append(os.path.join(indir, filename))
    print('done')
    print('Parsing files...')
    widgets = [
        FormatLabel('File: [%(value)s/' + str(len(filenames)) + ']'), ' ',
        Percentage(), ' ',
        Bar(marker='@', left='[', right=']'), ' ',
        ETA()
    ]

    pBar = ProgressBar(widgets=widgets, maxval=len(filenames)).start()
    for i, filename in enumerate(filenames):
        pBar.update(i, '')
        raw_message = codecs.open(filename, 'r', 'utf-8').read()
        msg = email.message_from_string(raw_message)
        mail = ASCIIMail(msg, os.path.join(outdir, os.path.basename(filename) + '.json'))
        mail.parse()
        if mail.reference != None:
            if all_references.get(mail.reference, None) == None:
                all_references[mail.reference] = mail
            else:
                all_references[mail.reference].append_reference(mail)
        else:
            all_other.append(mail)

    pBar.finish()
    print('Merging mails...')
    widgets = [
        FormatLabel('File: [%(value)s/' + str(len(all_references.keys())) + ']'), ' ',
        Percentage(), ' ',
        Bar(marker='@', left='[', right=']'), ' ',
        ETA()
    ]
    pBar = ProgressBar(widgets=widgets, maxval=len(all_references.keys())).start()
    # Now check for references...
    for i, a_ref in enumerate(all_references.keys()):
        pBar.update(i, '')
        amail = all_references[a_ref].parse_references()
        all_other.append(amail)

    pBar.finish()
    all_references = {}

    # Now save the found emails...
    print('Saving files... ', end='')
    sys.stdout.flush()
    for amail in all_other:
        amail.save()
    print('done')
Example #24
    log.info("Merging %i input ROOT files", len(flat_files))

    # Loop over (in, out) pairs
    for tree, h5name in zip(args.trees[::2], args.trees[1::2]):
        log.info("Writing input %s to output %s", tree, h5name)

        chain = ROOT.TChain(tree)
        for file in flat_files:
            chain.Add(file)

        entries = chain.GetEntries()
        log.info("There are %i rows in the input", entries)

        pbar = ProgressBar(
            widgets=[FormatLabel('Processed %(value)i/' + str(len(flat_files)) + ' files. '),
                     ETA(), Bar('>')],
            maxval=len(flat_files)).start()

        table = None

        ROOT.TTreeCache.SetLearnEntries(1)

        time_in_read = 0
        time_in_append = 0
        processed_files = 0

        try:
            for file_chunk in chunk_files(flat_files, args.chainsize):
Example #25
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = torch.nn.CrossEntropyLoss()

    print('Start training.')

    best_ed = 999
    early_stop_cnt = 0

    for epoch in range(1, epochs + 1):

        print('Epoch: {}/{}'.format(epoch, epochs))

        total_loss, total_acc, nonzeros = 0, 0, 0

        widgets = [
            FormatLabel(''), ' ',
            Bar('=', '[', ']'), ' - ',
            ETA(), ' ',
            FormatLabel('')
        ]
        pbar = ProgressBar(widgets=widgets, maxval=x_train.shape[0])
        pbar.start()

        for i, (x_batch, y_batch) in enumerate(train_loader):
            # Tensor to variable
            x_batch = Variable(x_batch).cuda()
            y_batch = Variable(y_batch).cuda()

            # Optimize
            output = model(x_batch)
            loss = loss_func(output.view(-1, output.size(-1)),
Example #26
            'parents': parent_terms
        }
    }
    return data


print('Loading de-wiktionary.json...', end='')
sys.stdout.flush()
pages = json.load(codecs.open('in/de-wiktionary.json', 'r', 'utf-8'))
print(' done')
print('Parsing...', end='')
sys.stdout.flush()
result = {}
num_articles = len(pages)
widgets = [
    FormatLabel('   Article: %(message)s [%(value)s/' + str(num_articles) +
                ']'), ' ',
    Percentage(), ' ',
    Bar(marker='#', left='[', right=']'), ' ',
    ETA()
]
pBar = ProgressBar(widgets=widgets, maxval=num_articles).start()

for i, page in enumerate(pages):
    pBar.update(i, page['title'])
    result.update(clean_article(page))

pBar.finish()
json.dump(result,
          codecs.open('out/de-wiktionary-db.json', 'w', 'utf-8'),
          indent=4)
Example #27
def example16():
    widgets = [FormatLabel('Bouncer: value %(value)d - '), BouncingBar()]
    pbar = ProgressBar(widgets=widgets)
    for i in pbar((i for i in range(180))):
        time.sleep(0.05)
Example #28
def get_pattern_features(infile):
    pool = mp.Pool(processes=mp.cpu_count() - 1)
    text_dict = defaultdict(lambda: defaultdict(Counter))
    tknzr = TweetTokenizer(reduce_len=True)
    pat_counter = Counter()
    pat_list = set()
    text_list = list()
    id_list = list()
    y = list()
    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        header = next(spamreader)
        for row in pbar((row for row in spamreader)):
            text_list.append(row)
            id_list.append(row[0])
            y.append(row[1])
        pbar.finish()

    cpus = mp.cpu_count() - 1
    unit = int(len(text_list) / cpus)
    text_chunks = [text_list[i * unit:i * unit + unit] for i in range(cpus)]
    text_chunks[cpus - 1].extend(text_list[unit * cpus:])

    res = [
        pool.apply_async(get_pattern_counter, (text_chunks[i], ))
        for i in range(cpus)
    ]
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(res)).start()
    index = 0
    pool.close()
    pool.join()
    print("Extracting Eric's patterns...")
    for item in res:
        pat_counter += item.get()
        pbar.update(index + 1)
        index += 1
    pbar.finish()

    pat_list = list(zip(*(pat_counter.most_common()[0:5000]))[0]) + list(
        zip(*(pat_counter.most_common()[-5000:-1]))[0])
    print len(pat_list)

    X = list()
    pbar = ProgressBar(widgets=[Percentage(), Bar()],
                       maxval=len(id_list)).start()
    index = 0
    print("Generating Eric's pattern features...")
    for i, idx in enumerate(id_list):
        X.append([text_dict[idx][y[i]][pat] for pat in pat_list])
        pbar.update(index + 1)
        index += 1
    pbar.finish()

    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(y.shape)

    return X, y, id_list
Example #29
def example11():
    widgets = [FormatLabel('Processed: %(value)d lines (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    for i in pbar((i for i in range(150))):
        time.sleep(0.1)
Example #30
def gather_result():
    start_cur = 0
    end_cur = len(gl.list_OK)
    bar_length = len(gl.list_OK)
    if gl.g_time != 0:
        for index in range(len(gl.list_OK)):
            if right_swich(gl.list_OK[index][4]) > swich_time(gl.g_time):
                start_cur = index
                break
    if gl.g_number != -1:
        end_cur = start_cur + gl.g_number
        bar_length = gl.g_number
    widgets = [
        'Gather result: ',
        Percentage(), ' ',
        Bar(marker='|', left='|', right='|'), '[',
        FormatLabel('%(elapsed)s'), ']'
    ]
    if gl.g_number == 0 or len(gl.list_OK) == 0:
        end_cur = 100
        bar_length = 100
    pbar = ProgressBar(widgets=widgets, maxval=bar_length)
    pbar.start()
    diff_time = 0
    glb_line = ''
    glb_line_number = 0
    tar_handle = open(gl.file_output, 'w')
    if gl.file_flag == 2:
        tar_handle.write('QuoteId\tQuoteId\tOrderBookID\tDiffTime\n')
    elif gl.file_flag == 1:
        tar_handle.write('bid_quote_id\task_quote_id\tfeedcode\tdiff_time\n')
    try:
        for cur_i in range(start_cur, end_cur):
            if gl.g_number == 0 or len(gl.list_OK) == 0:
                pass
            else:
                if cur_i < len(gl.list_OK):
                    for cur_j in range(3):
                        glb_line = glb_line + get_value(
                            gl.list_OK[cur_i][cur_j]) + '\t'
                    diff_time = right_swich(
                        gl.list_OK[cur_i][-2]) - right_swich(
                            gl.list_OK[cur_i][-1])
                    if cur_i == 0:
                        gl.max_time = diff_time
                        gl.min_time = diff_time
                    elif gl.max_time < diff_time:
                        gl.max_time = diff_time
                    elif gl.min_time > diff_time:
                        gl.min_time = diff_time
                    gl.average_time += diff_time
                    glb_line = glb_line + str(diff_time) + '\t\n'
                    tar_handle.write(glb_line)
                    glb_line_number += 1
                    glb_line = ''
            pbar.update(cur_i)
        pbar.finish()
    finally:
        tar_handle.close()
    if gl.g_number == 0 or len(gl.list_OK) == 0:
        gl.min_time = 0
    else:
        gl.average_time = gl.average_time / float(glb_line_number)
    print "--------------------------------"
    print "%15s%d\n%15s%d\n%15s%d\n%15s%f" % (
        "Total Number:", glb_line_number, "Max_time:", gl.max_time,
        "Min_time:", gl.min_time, "Average_time:", gl.average_time)
    print "--------------------------------"