Example #1
    def generate_pagination(self, url):
        urls = []
        for i in progress(range(20, 7000, 20)):
            url = 'https://www.tripadvisor.de/Restaurants-g187275-oa{}-Germany.html#LOCATION_LIST'.format(i)
            urls.append(url)
        return urls
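
Note: none of these snippets defines `progress` itself. Judging from the calls used throughout (total=, unit=, unit_scale=, desc=, leave=, .update(), .close(), .n), it behaves like a tqdm progress bar; a minimal sketch of the alias the examples appear to assume:

from tqdm import tqdm as progress  # assumption: `progress` is an alias for tqdm, not shown in any snippet
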
Example #2
def copy_blob_as_remote(blob_url, copied_blob):
    """Copy blob as append file."""
    # Copies as Append file
    count = 0

    # Get file size
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))
    prog = progress(total=total_size, unit="iB", unit_scale=True)

    # Start copy process
    copied_blob.start_copy_from_url(blob_url)
    props = copied_blob.get_blob_properties()

    while props.copy.status == "pending":

        _LOGGER.info(props.copy.status + " " + props.copy.progress)

        count = count + 1
        if count > 100:
            raise TimeoutError("Timed out waiting for async copy to complete.")
        time.sleep(5)

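        # The copy status appears to report progress as a "<bytes copied>/<total bytes>"
        # string (an assumption based on how it is parsed below), so the numerator tells
        # the bar how many bytes have landed so far.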
        length = int(props.copy.progress.split("/")[0])
        diff = length - prog.n
        prog.update(diff)

        props = copied_blob.get_blob_properties()

    prog.close()
Example #3
def test_progress():
    """Test progress bar."""
    total_size = 100
    prog = progress(total=total_size, unit="iB", unit_scale=True)

    for _ in range(100):
        prog.update(1)  # advance one unit per iteration so the bar ends exactly at total_size
    prog.close()
Example #4
def getHazardDataFrame(df):
    '''Get hazard data
    
    Args:
        df (Pandas DataFrame): the dataframe containing survey data
    
    Returns:
        DataFrame: processed DataFrame. Almost ready to use with lifelines.
    '''
    survey_year = df['year'].iloc[0]
    Mother.collection_year = survey_year
    Mother.master_df = df
    # make a list of all unique mother id
    id_num_list = df['idhspid'].tolist()
    id_num_list = list(dict.fromkeys(id_num_list))
    # create a Mother object for each id
    mother_list = [
        Mother(num)
        for num in progress(id_num_list, desc='Creating mother objects')
    ]
    # create an array containing all relevant data
    mother_arrays = [mother.genHazardArray() for mother in mother_list]
    data = np.column_stack(mother_arrays).T
    out_df = pd.DataFrame(
        data=data,
        columns=['IDHSPID', 'Event Time', 'Event Occured', 'DHSID', 'Year'])

    return out_df
Example #5
def main():
    # get command-line arguments
    cmd_args = commandLineParser()
    # assign Class variable to the correct DataFrame
    input_df = pd.read_csv(cmd_args.input_csv)
    if cmd_args.hazard_regressions:
        df = getHazardDataFrame(input_df)
    else:
        survey_year = input_df['year'].iloc[0]
        Mother.collection_year = survey_year
        Mother.master_df = input_df
        # make a list of all unique mother id
        id_num_list = input_df['idhspid'].tolist()
        id_num_list = list(dict.fromkeys(id_num_list))
        # create a Mother object for each id
        mother_list = [
            Mother(num)
            for num in progress(id_num_list, desc='Creating mother objects')
        ]
        # create an array containing all relevant data
        mother_arrays = [mother.genDataArray() for mother in mother_list]
        data = np.concatenate(mother_arrays)
        df = pd.DataFrame(
            data=data,
            columns=['DHSID', 'IDHSPID', 'Year', 'Mother\'s Age', 'Baby?'])
    # make a new DataFrame and export as csv
    df.to_csv(cmd_args.output_csv, index=False)
Example #6
def download_file(url, dte, ignore_header_rows=0):
    """Download usage report with shared access key based URL."""
    skipped_header = False
    size = 1024  # 1 Kibibyte

    local_filename = "usage-%s.csv" % (dte.isoformat())
    local_filename = local_filename.replace(":", "-")
    # NOTE the stream=True parameter
    resp = requests.get(url, stream=True)
    resp.encoding = "utf-8"
    total_size = int(resp.headers.get("content-length", 0))

    prog = progress(total=total_size, unit="iB", unit_scale=True)
    with open(local_filename, "wb") as csvfile:
        for chunk in resp.iter_content(chunk_size=size, decode_unicode=True):

            if ignore_header_rows and not skipped_header:
                bom = chunk[0]
                lines = chunk.split("\r\n")
                joined = bom + "\r\n".join(lines[ignore_header_rows:])

                encoded_chunk = joined.encode()

                skipped_header = True
            else:
                encoded_chunk = chunk.encode()

            if encoded_chunk:  # filter out keep-alive new chunks
                prog.update(len(encoded_chunk))
                csvfile.write(encoded_chunk)
                # f.flush() commented by recommendation from J.F.Sebastian
    prog.close()
    return (local_filename, total_size)
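
A hedged usage sketch for the function above; the shared-access-signature URL below is a placeholder, not taken from the snippet:

import datetime

report_url = "https://example.blob.core.windows.net/reports/usage.csv?sig=..."  # placeholder SAS URL
filename, size = download_file(report_url, datetime.date(2021, 3, 1), ignore_header_rows=1)
print("Saved {} ({} bytes)".format(filename, size))
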
Example #7
def importPrecipData(month_range, windows='', precip_data_folder='./resources/precip_data', testing=False):
    '''This function imports all precip data in ./resources/precip_data or another specified folder
    
    Args:
        month_range (list): a list of months across which to sum the rainfall
        windows (str, optional): a string representing the path to the file containing the names of the precip files. Defaults to the empty string.
        precip_data_folder (str, optional): a string representing the path to the folder in which all of the .precip files are stored. Defaults to './resources/precip_data'
        testing (bool, optional): whether or not the function is in testing mode. If so, only the first ten precip files will be considered for speed. Defaults to False
    
    Returns:
        list: a list of parsed precip data, of the form [[[x1, y1], SUM1], [[x2, y2], SUM2], ...] where each SUM is the sum of the rainfall in the selected months
    '''
    # get list of precip files
    if windows:
        precip_contents = fp.precipListParser(windows, testing=testing)
    else:
        os.system(f'cd {precip_data_folder}; ls precip* > ../../precip.txt')
        precip_contents = fp.precipListParser('precip.txt', testing=testing)
        os.system('rm precip.txt')
    # modify the path variable
    precip_contents = [os.path.join(precip_data_folder, file) for file in precip_contents]
    # create precip data list for them all
    precip_data = [fp.precipFileParser(path, month_range) for path in progress(precip_contents, desc='Importing precip data')]

    return precip_data
Example #8
    def train_model(self, epochs=10000, summary_every=1, log_writer=None):
        subimage_provider = self._subimage_provider
        evaluation_input, evaluation_target = subimage_provider.evaluation_subimages(
        )
        evaluation_feed_dict = {
            self._net_input: evaluation_input,
            self._net_target: evaluation_target
        }

        updates_per_epoch = int(
            math.ceil(subimage_provider.training_subimage_count() /
                      self._batch_size))

        sess = self.session
        training_loss_values = []
        evaluation_loss_values = []
        min_train_loss = 10e3
        min_eval_loss = 10e3
        for e_num, epoch in enumerate(
                progress(range(epochs),
                         desc='Training model...',
                         unit='epochs')):
            for u_num, update in enumerate(range(updates_per_epoch)):
                input_batch, target_batch = subimage_provider.random_training_subimage_batch(
                    batch_size=self._batch_size)
                training_feed_dict = {
                    self._net_input: input_batch,
                    self._net_target: target_batch
                }
                sess.run(self._train_step, feed_dict=training_feed_dict)

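                # Evaluate and log only on the last update of every `summary_every`-th epoch;
                # the model is checkpointed when both losses improve on their minima.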
                if (e_num + 1
                    ) % summary_every == 0 and u_num == updates_per_epoch - 1:
                    training_loss_value = sess.run(
                        self._loss_function, feed_dict=training_feed_dict)
                    training_loss_values.append(training_loss_value)
                    evaluation_loss_value = sess.run(
                        self._loss_function, feed_dict=evaluation_feed_dict)
                    evaluation_loss_values.append(evaluation_loss_value)
                    print(
                        '\nEpoch #: {} -> Training Loss: {}, Evaluation Loss: {}'
                        .format(e_num + 1, str(training_loss_value),
                                str(evaluation_loss_value)))

                    if log_writer is not None:
                        self._write_training_logs(log_writer, e_num,
                                                  training_feed_dict,
                                                  evaluation_feed_dict)

                    if self._save_file is not None and \
                            evaluation_loss_value < min_eval_loss and training_loss_value < min_train_loss:
                        min_train_loss = training_loss_value
                        min_eval_loss = evaluation_loss_value
                        self._saver.save(sess, self._save_file)
        print('\nMin training Loss: {}, Min evaluation Loss: {}'.format(
            str(min_train_loss), str(min_eval_loss)))
        return training_loss_values, evaluation_loss_values
Example #9
def download_file(url: str,
                  destination: Path,
                  http_headers: Optional[dict] = None,
                  proxies: Optional[dict] = None,
                  show_progress: bool = True):
    req = request.Request(url)
    if http_headers:
        for name, value in http_headers.items():
            req.add_header(name, value)
    if proxies:
        # TODO: Should we only set the proxy associated with the URL scheme?
        #  Should we raise an exception if there is not a proxy defined for
        #  the URL scheme?
        # parsed = parse.urlparse(url)
        for proxy_type, proxy_url in proxies.items():  # avoid shadowing `url`, which is logged below
            req.set_proxy(proxy_url, proxy_type)
    rsp = request.urlopen(req)

    size_str = rsp.getheader("content-length")
    total_size = int(size_str) if size_str else None
    block_size = 16 * 1024
    if total_size and total_size < block_size:
        block_size = total_size

    LOG.debug("Downloading url %s to %s", url, str(destination))

    if show_progress and progress:
        progress_bar = progress(total=total_size,
                                unit="b",
                                unit_scale=True,
                                unit_divisor=1024,
                                desc=f"Localizing {destination.name}")

        def progress_reader():
            buf = rsp.read(block_size)
            if buf:
                progress_bar.update(block_size)
            else:
                progress_bar.close()
            return buf

        reader = progress_reader
    else:
        reader = functools.partial(rsp.read, block_size)

    with open(destination, "wb") as out:
        while True:
            buf = reader()
            if not buf:
                break
            out.write(buf)
Example #10
    def download_file(
        self,
        destination: Path,
        show_progress: bool = False,
        digests: Optional[dict] = None
    ):
        total_size = self.get_content_length()
        block_size = 16 * 1024
        if total_size and total_size < block_size:
            block_size = total_size

        if show_progress and progress:
            progress_bar = progress(
                total=total_size,
                unit="b",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Localizing {destination.name}"
            )

            def progress_reader():
                b = self.read(block_size)
                if b:
                    progress_bar.update(block_size)
                else:
                    progress_bar.close()
                return b

            reader = progress_reader
        else:
            reader = functools.partial(self.read, block_size)

        downloaded_size = 0

        with open(destination, "wb") as out:
            while True:
                buf = reader()
                if not buf:
                    break
                downloaded_size += len(buf)
                out.write(buf)

        if total_size is not None and downloaded_size != total_size:  # TODO: test this
            raise AssertionError(
                f"Size of downloaded file {destination} does not match expected size "
                f"{total_size}"
            )

        if digests:
            verify_digests(destination, digests)
Example #11
    def get_urls_from_pagination(self, server, pagination):
        list = []  # accumulate links across every page (previously reset on each iteration)
        base_url = 'https://www.tripadvisor.de'
        for url in progress(pagination):

            response = server.get(url)
            response_in_lxml = bs(response.content, 'lxml')

            cities = response_in_lxml.find_all('ul',
                                               attrs={'class': 'geoList'})
            for city in cities:
                for li in city.find_all('li'):
                    link = li.find('a', href=True)['href']
                    list.append({
                        'link': base_url + link,
                    })
        return list
Example #12
    def _load_subimages(self, set_type, satellite, bands, progress_desc='Loading subimages'):
        # List of places folders for the given satellite
        satellite_places = [os.path.join(place, satellite) for place in self._list_places_folders(set_type)]
        subimage_count = None
        progress_bar = None
        subimages = None

        # One band at a time
        for b, band in enumerate(bands):
            # List of places folders for the given satellite and band
            band_places_folders = [os.path.join(satellite_place, str(band + 1)) for satellite_place in satellite_places]

            for i, band_place_folder in enumerate(band_places_folders):
                band_files = os.listdir(band_place_folder)
                band_files.sort(key=lambda s: reduce(lambda x, y: y + x * 10e5,
                                                     [int(elem) for elem in s[:-4].split('_')]))

                band_place_files = [os.path.join(band_place_folder, file) for file in band_files
                                    if os.path.isfile(os.path.join(band_place_folder, file))]

                # Complete images amount if uncertain, follows "each folder has the same amount of files" convention
                if subimage_count is None:
                    subimage_count = len(band_place_files) * len(satellite_places)

                # Initialize progress bar if uninitialized
                if progress_bar is None:
                    progress_bar = progress(total=subimage_count * len(bands), desc=progress_desc, unit='imgs')

                for j, band_place_file in enumerate(band_place_files):
                    band_image = imread(band_place_file)
                    # Initialize 4-dimensional array if uninitialized
                    if subimages is None:
                        image_width, image_height = band_image.shape
                        subimages = np.empty([subimage_count, image_height, image_width, len(bands)],
                                             dtype=np.uint16)

                    # Indexes as follows: n images from place 0; n images from place 1; ...; n images from place p.
                    # Where: n is the amount of images per folder and p the amount of places
                    index = j + (i * len(band_place_files))
                    subimages[index, :, :, b] = band_image
                    progress_bar.update()

        progress_bar.close()
        return subimages
Example #13
def animate(title, salleTest, results, fig, axe, Nt, dt):
    '''This function creates an animation and adds all the required legends.
    Due to matplotlib limitations, a few workarounds are needed to display
    a legend from inside the code.'''
    # Redraws the room and every agent for frame k
    def update(k, progress_bar):
        xs = []
        ys = []
        colors = []
        axe.clear()
        axe.set_xlim(0, salleTest.Lx)
        axe.set_ylim(0, salleTest.Ly)
        axe.set_title(str(round(k * dt, 2)))
        salleTest.afficher(fig, axe)
        for agent_data in results[k]:
            color, x, y = agent_data
            xs.append(x)
            ys.append(y)
            colors.append(color)
        axe.scatter(xs, ys, c=colors)
        progress_bar.update(1)  # advance by one frame per call (update() takes an increment)

    progress_bar = progress(range(Nt), desc="Export vidéo")

    # Figure initialisation
    axe.set_xlim(0, salleTest.Lx)
    axe.set_ylim(0, salleTest.Ly)

    ani = animation.FuncAnimation(fig, update, fargs=(progress_bar,), frames=Nt,
                                  interval=dt * 1000, blit=False, repeat=True)

    return ani
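
A hedged usage sketch for saving the returned animation (the figure, room object, and results list are assumed to come from the surrounding simulation code; ffmpeg must be on PATH):

ani = animate('simulation', salleTest, results, fig, axe, Nt, dt)
ani.save('simulation.mp4', writer='ffmpeg')
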
Example #14
def merge_csv_files(file_1, file_2, file_3):
    '''
        @parameters variant_call_file_csv, format_column_split_csv, final_merged_csv

        Using the variant_call_file_csv, and format_column_split_csv the two files are merged
        to create a final_merged_csv. The final csv will have the original VCF file format with the 
        additional columns which were split out from the NORMAL and TUMOR columns. 

        @return None 

    '''

    csv.field_size_limit(10000000)
    csv.field_size_limit()
    with open(file_1, 'r') as csv_1, open(file_2,
                                          'r') as csv_2, open(file_3,
                                                              'w') as out_file:
        reader_1 = csv.reader(csv_1)
        reader_2 = csv.reader(csv_2)
        writer = csv.writer(out_file)
        for row_1, row_2 in progress(zip(reader_1, reader_2)):
            writer.writerow(row_1 + row_2)
Example #15
    def get_urls_from_pagination(self, server, pagination):
        list = []
        for url in progress(pagination):
            base_url = 'https://www.tripadvisor.de'
            parsed = 0

            response = server.get(url)
            response_in_lxml = bs(response.content, 'lxml')

            cities = response_in_lxml.find_all('ul', attrs={'class': 'geoList'})
            for city in cities:
                for li in city.find_all('li'):
                    link = li.find('a', href=True)['href']
                    city_name = li.find('a').text
                    city_name = city_name.replace('Restaurants ', '')
                    list.append({
                        "country": "Germany",
                        "link": base_url + link,
                        "city_name": city_name,
                        "parsed": parsed
                    })
        return list
Example #16
def copy_blob_as_github_suggested(blob_url, copied_blob):
    """Copy append as block via github suggession."""
    i = 0
    running = 0
    chunk_size = 10 * 10 * 10 * 10 * 10 * 1024

    # Upload empty file
    copied_blob.upload_blob(b"")

    # Get File size
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))

    prog = progress(total=total_size, unit="iB", unit_scale=True)

    # Stage the source blob chunk by chunk; `step` counts down the bytes still to copy
    for step in range(total_size, 0, -chunk_size):
        offset = total_size - step
        length = chunk_size
        if step < chunk_size:
            length = step

        copied_blob.stage_block_from_url(
            block_id=i + 1,
            source_url=blob_url,
            source_offset=offset,
            source_length=length,
        )

        running += length
        i += 1
        prog.update(length)

    block_list = [BlobBlock(block_id=j + 1) for j in range(i)]  # commit every staged block, not just the first
    copied_blob.commit_block_list(block_list)

    # committed, _ = copied_blob.get_block_list("all")

    prog.close()
Example #17
def copy_blob_as_blocks(blob_url, copied_blob):
    """Copy blob as blocks."""
    # Get target size
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))
    chunk_size = 10 * 10 * 10 * 10 * 1024

    # Upload empty file
    copied_blob.upload_blob(b"")

    i = 0
    running = 0
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    for step in range(total_size, 0, -chunk_size):

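        # `step` counts down the bytes still to copy, so `offset` is how much has already been staged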
        offset = total_size - step
        length = chunk_size
        if step < chunk_size:
            length = step

        # this will only stage your block
        copied_blob.stage_block_from_url(
            block_id=i + 1,
            source_url=blob_url,
            source_offset=offset,
            source_length=length,
        )

        # nothing is committed yet; just track how many bytes have been staged
        running += length
        i += 1
        prog.update(length)

    copied_blob.commit_block_list([j + 1 for j in range(i)])

    prog.close()
    committed, _ = copied_blob.get_block_list("all")
    assert total_size == running
    assert i == len(committed)  # every staged block should have been committed
Example #18
def sumSlicing(sum_list, len_years, verbose=False):
    '''This function generates all percentiles across a list.
    
    Args:
        sum_list (list): a list containing all the rainfall sum data.
        len_years (int): how many years to fit a gamma distribution
        verbose (bool, optional): whether to display a progress bar. Defaults to False
    
    Returns:
        list: a list of percentiles fitted to a gamma distribution
    '''
    if verbose: pbar = progress(total=len(sum_list)-len_years, leave=False)     # establish a nice progress bar
    leading_pointer = 0
    okazaki_pointer = len_years + 1
    percentile_list = []
    while okazaki_pointer <= len(sum_list):                         # iterate over every slice of the list that allows for adequate length
        data = sum_list[leading_pointer:okazaki_pointer]
        temp = percentile(data)
        percentile_list.append(temp)
        leading_pointer += 1
        okazaki_pointer += 1
        if verbose: pbar.update(1)  # update progress bar
    if verbose: pbar.close()

    return percentile_list
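
A hedged worked example of the window indexing above, assuming len_years=3 (each slice handed to percentile() holds len_years + 1 consecutive sums):

sums = [10, 12, 9, 14, 11, 13]
# slices produced by the two pointers: sums[0:4], sums[1:5], sums[2:6]
windows = [sums[i:i + 4] for i in range(len(sums) - 3)]
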
Example #19
def tokenize(texts,
             max_length,
             skip=-2,
             attr=LOWER,
             merge=False,
             nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP directory in memory, and takes
    up to a minute to run the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter then this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA)
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : None
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to None

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.items()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = en.load()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    token_list = []
    vocab = {}
    index = 0
    for row, doc in progress(enumerate(nlp.pipe(texts, **kwargs))):
        if merge:
            for phrase in doc.noun_chunks:
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
                for ent in doc.ents:
                    if len(ent) > 1:
                        ent.merge(ent.root.tag_, ent.text, ent.label_)

        dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL]).astype("int32")
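        # Column 0 holds the LOWER attribute; columns 1 and 2 flag email- and URL-like tokens.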
        for i, token in enumerate(doc):
            text = token.text.lower()
            if text not in list(vocab.values()):
                dat[i][0] = index
                vocab[index] = text
                index += 1
            else:
                for k, v in vocab.items():
                    if v == text:
                        value = k
                        break
                dat[i][0] = value
        if len(dat) > 0:
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()

    vocab[skip] = '<SKIP>'
    return data, vocab
Example #20
# Hardware
conditioner = analog.HeterodyneMarkII()
magnet = hardware.Thing('canceling_magnet',
                        {'orientation': 'up',
                         'distance_from_base_mm': 25})
hw = hardware.Hardware(conditioner, magnet)
ri = hardware_tools.r2_with_mk2()
ri.set_dac_atten(40)
ri.set_fft_gain(4)
ri.set_modulation_output('high')

# Run
ncf = acquire.new_nc_file(suffix='sweep_stream')
tic = time.time()
try:
    for lo in progress(lo_MHz):
        state = hw.state()
        state['temperature'] = {'package': temps.get_temperature_at(time.time())}
        tone_banks = (lo + offsets_MHz)[:, np.newaxis]  # Transform to shape (num_offsets, 1)
        ri.set_lo(lomhz=lo, chan_spacing=round_to_MHz)
        sweep_array = acquire.run_sweep(ri, tone_banks=tone_banks, num_tone_samples=num_tone_samples,
                                        length_seconds=sweep_length_seconds)
        single_sweep = sweep_array[0]
        f0_MHz = 1e-6 * single_sweep.resonator.f_0
        ri.set_tone_freqs(np.array([f0_MHz]), nsamp=num_tone_samples)
        ri.select_fft_bins(np.array([0]))
        stream_array = ri.get_measurement(num_seconds=stream_length_seconds)
        single_stream = stream_array[0]
        sweep_stream = basic.SingleSweepStream(sweep=single_sweep, stream=single_stream, state=state,
                                               description='f_0 = {:.1f}'.format(f0_MHz))
        ncf.write(sweep_stream)
finally:
    # close the data file even if the sweep loop raises (mirrors the companion sweep script in Example #26)
    ncf.close()
    print("Wrote {}".format(ncf.root_path))
Example #21
def body(sum_list, cmd_args):
    # calculate percentiles
    rainfall_percentiles = [sumSlicing(rainfall_sum, cmd_args.len_years, cmd_args.verbose) for rainfall_sum in progress(sum_list, desc='Calculating Percentiles')]
    if cmd_args.verbose or __name__ == '__main__':
        # print out year range
        _, columns = os.popen('stty size', 'r').read().split()
        fancy_sep = ['-' for _ in range(int(columns))]
        print(''.join(fancy_sep))                                   # allow for some eyeball breathing room
        print(f'This program calculated {len(rainfall_percentiles[0])} years worth of percentiles.\nThe list stored in "Rainfall Percentiles" represents data beginning in the year {1950+cmd_args.len_years}.\nThis is assuming that the first precip file contains data from the year 1950.')

    return rainfall_percentiles
Example #22
def main(args):
    '''
    Main Control Flow
    Note that the actual steps run are configured in the YAML input! This allows you to
    e.g. skip previously run steps.
    '''

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print("job started")

    with open(args[1], mode='r') as yaml_file:
        params, steps, = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    program_name = params['PROGRAM_NAME']
    fs = gcsfs.GCSFileSystem(token='google_default')
    legacy_tag = params['LEGACY_TAG']
    max_workers = params['MAX_WORKERS']
    add_normal_col = params['NORMAL_COL']

    # Directory to send each intermediary file to
    home = expanduser('~')
    variant_call_file_csv = f"{home}/NextGenETL/intermediateFiles/{params['PARSED_VARIANT_CALL_FILE']}"
    format_column_split_csv = f"{home}/NextGenETL/intermediateFiles/{params['FORMAT_COLUMN_SPLIT_FILE']}"
    final_merged_csv = f"{home}/NextGenETL/intermediateFiles/{params['FINAL_MERGED_CSV']}"
    format_information_file = f"{home}/NextGenETL/intermediateFiles/{params['FORMAT_INFO_FILE']}"
    dataframe_information_file = f"{home}/NextGenETL/intermediateFiles/{params['DATAFRAME_INFO_FILE']}"

    # Google Cloud Storage bucket path
    bucket_path = params['BUCKET_PATH']

    # Schemas
    schema_path = f"{home}/NextGenETL/intermediateFiles/{program_name.lower()}_simple_build_schema.json"

    # Staging table info for staging env
    staging_project = params['STAGING_PROJECT']
    staging_dataset_id = params['STAGING_DATASET_ID']
    staging_table_id = params['STAGING_TABLE_ID']
    scratch_full_table_id = f'{staging_project}.{staging_dataset_id}.{staging_table_id}'

    # Publish table info for production env
    publish_project = params['PUBLISH_PROJECT']
    publish_dataset_id = params['PUBLISH_DATASET_ID']
    publish_table_id = params['PUBLISH_TABLE_ID']
    schema_with_desc = schema_with_description(
        params['SCHEMA_WITH_DESCRIPTION'])

    # Path to Labels, Description, and FriendlyName
    labels_and_desc = params['LABEL_DESCRIPTION_FREINDLYNAME']

    if 'extract_metadata_table' in steps:
        print('* Extracting Meta-Data Table from Google BigQuery!')
        file_urls, project_short_names, file_names, analysis_workflow_types, case_barcodes, entity_ids = query_for_table(
            params['FILEDATA_ACTIVE'], params['GDCID_TO_GCSURL'],
            params['ALIQUOT_TO_CASEID'], program_name)

        print(f'Number of files to be processed: {len(file_urls)}')
        print(
            f'Number of projects in the program, {program_name}: {len(set(project_short_names))}'
        )
        print(
            f'Number of workflow types for the program, {program_name}: {len(set(analysis_workflow_types))}'
        )

        pbar = progress(total=len(file_urls))
        file_urls = iter(file_urls)
        project_short_names = iter(project_short_names)
        file_names = iter(file_names)
        analysis_workflow_types = iter(analysis_workflow_types)
        case_barcodes = iter(case_barcodes)
        entity_ids = iter(entity_ids)

    if 'transform_vcf' in steps:
        print('* Transforming and Parsing the VCF Files!')

        # Open an empty csv to store the vcf dataframes (Concatenated VCFs)
        with open(variant_call_file_csv, 'w') as out_file:
            pass

        with open(format_information_file, 'w') as format_out:
            pass

        with concurrent.futures.ProcessPoolExecutor(
                max_workers=max_workers) as executor:
            add_header = True
            futures = []
            start_process(next(file_urls),
                          next(project_short_names),
                          next(file_names),
                          next(analysis_workflow_types),
                          next(case_barcodes),
                          next(entity_ids),
                          fs,
                          variant_call_file_csv,
                          legacy_tag,
                          add_normal_col,
                          format_information_file,
                          add_header=add_header)
            pbar.update()

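            # Prime the pool with up to max_workers tasks, then submit one new file each time a task completes.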
            running = set()
            for _, a_file, project_short_name, file_name, analysis_workflow_type, case_barcode, entity_id in zip(
                    range(max_workers), file_urls, project_short_names,
                    file_names, analysis_workflow_types, case_barcodes,
                    entity_ids):
                running.add(
                    executor.submit(start_process,
                                    a_file,
                                    project_short_name,
                                    file_name,
                                    analysis_workflow_type,
                                    case_barcode,
                                    entity_id,
                                    fs,
                                    variant_call_file_csv,
                                    legacy_tag,
                                    add_normal_col,
                                    format_information_file,
                                    add_header=False))
            while running:
                done, running = concurrent.futures.wait(
                    running, return_when=concurrent.futures.FIRST_COMPLETED)
                for _ in done:
                    pbar.update()
                for _, a_file, project_short_name, file_name, analysis_workflow_type, case_barcode, entity_id in zip(
                        range(len(done)), file_urls, project_short_names,
                        file_names, analysis_workflow_types, case_barcodes,
                        entity_ids):
                    running.add(
                        executor.submit(start_process,
                                        a_file,
                                        project_short_name,
                                        file_name,
                                        analysis_workflow_type,
                                        case_barcode,
                                        entity_id,
                                        fs,
                                        variant_call_file_csv,
                                        legacy_tag,
                                        add_normal_col,
                                        format_information_file,
                                        add_header=False))

    if 'create_new_columns' in steps:
        print('* Creating New Columns!')
        create_new_columns(variant_call_file_csv, format_column_split_csv)

    if 'merge_csv_files' in steps:
        print('* Merging CSV Files!')
        merge_csv_files(variant_call_file_csv, format_column_split_csv,
                        final_merged_csv)

    if 'build_a_simple_schema' in steps:
        print('* Generating a Simple Schema! ')
        simple_schema_builder(program_name, final_merged_csv,
                              dataframe_information_file, home)

    if 'push_csv_to_bucket' in steps:
        print('* Pushing CSV File to Bucket!')
        push_file_to_bucket(final_merged_csv, bucket_path)

    if 'load_to_staging_environment' in steps:
        print('* Loading a Table in to a Staging Environment!')
        load_to_staging_env(staging_dataset_id, staging_table_id, bucket_path,
                            schema_path)

    if 'load_to_production_environment' in steps:
        print('* Loading a Table in to a Production Environment!')
        load_to_production_env(publish_project, publish_dataset_id,
                               publish_table_id, schema_with_desc,
                               scratch_full_table_id, labels_and_desc)
Example #23
def create_new_columns(file_1, file_2):
    '''
        @parameters file_1, file_2

        This function reads the variant call CSV (file_1) and parses out the
        FORMAT column. Each key found there becomes a new column of its own, and
        the values split out of the NORMAL and TUMOR columns are written to file_2.

        @return None 

    '''

    csv.field_size_limit(10000000)
    csv.field_size_limit()
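    # First pass: scan every row's FORMAT column to collect all keys that appear,
    # so the output header can cover the union of them.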
    with open(file_1) as file_in:
        reader = csv.reader(file_in)
        header = next(reader)
        format_column_index = header.index('FORMAT')
        column_names = set()
        for row in progress(reader):
            cell_information = row[format_column_index]
            column_names.update(cell_information.split(':'))
    column_names = list(column_names)
    num_cols = len(column_names)

    with open(file_1) as file_in:
        with open(file_2, 'w') as file_out:
            reader = csv.reader(file_in)
            writer = csv.writer(file_out)
            header = next(reader)

            if 'NORMAL' in header:
                format_column_index = header.index('FORMAT')
                normal_column_index = header.index('NORMAL')
                tumor_column_index = header.index('TUMOR')
                writer.writerow([f'{name}_Normal' for name in column_names] +
                                [f'{name}_Tumor' for name in column_names])

                for row in progress(reader):
                    columns = row[format_column_index].split(':')
                    tumor_col_values = row[tumor_column_index].split(':')
                    if row[normal_column_index] != '':
                        normal_col_values = row[normal_column_index].split(':')
                        column_indicies = [
                            column_names.index(column) for column in columns
                        ]
                        row_out = [''] * (num_cols * 2)
                        for column_index, normal_value, tumor_value in zip(
                                column_indicies, normal_col_values,
                                tumor_col_values):
                            row_out[column_index] = normal_value
                            row_out[column_index + num_cols] = tumor_value
                        writer.writerow(row_out)
                    else:
                        column_indicies = [
                            column_names.index(column) for column in columns
                        ]
                        row_out = [''] * (num_cols * 2)
                        for column_index, tumor_value in zip(
                                column_indicies, tumor_col_values):
                            row_out[column_index + num_cols] = tumor_value
                        writer.writerow(row_out)
            else:
                format_column_index = header.index('FORMAT')
                tumor_column_index = header.index('TUMOR')
                writer.writerow([f'{name}_Tumor' for name in column_names])
                for row in progress(reader):
                    columns = row[format_column_index].split(':')
                    tumor_col_values = row[tumor_column_index].split(':')
                    column_indicies = [
                        column_names.index(column) for column in columns
                    ]
                    row_out = [''] * (num_cols)
                    for column_index, tumor_value in zip(
                            column_indicies, tumor_col_values):
                        row_out[column_index] = tumor_value
                    writer.writerow(row_out)
Example #24
def body(cmd_args):
    '''This function runs the main functionality
    
    Args:
        cmd_args (argparse.Namespace): an argparse namespace
    
    Returns:
        GeoDataFrame: a GeoPandas GeoDataFrame with all of the rainfall sums included.
    '''
    # parse month range
    month_range = fp.cropCalendarParser(cmd_args.unit_code)
    month_range = [int(month) for month in month_range]
    # get precip data
    precip_data = importPrecipData(month_range, windows=cmd_args.windows, testing=cmd_args.testing)
    # get geodata
    st_coords = fp.precipFileParser('./resources/precip_data/precip.1977', [4, 8], return_coords=True)
    gdf = fp.shapeFileParser(cmd_args.shapefile_path, st_coords, cmd_args, testing=cmd_args.testing)
    # generate rainfall totals
    station_indices = gdf['Station Indices'].tolist()
    rainfall_totals = [generateRainFallSums(index_list, data) for index_list, data in progress(zip(station_indices, itertools.repeat(precip_data)), total=len(gdf['Station Indices']), desc='Calculating rainfall sums')]
    gdf['Rainfall Totals'] = rainfall_totals
    # print out needed calculation stats
    station_lengths = [len(lst) for lst in station_indices]     # how many stations were captured
    _, columns = os.popen('stty size', 'r').read().split()
    fancy_sep = ['-' for _ in range(int(columns))]
    print(''.join(fancy_sep))                                   # allow for some eyeball breathing room
    print(f'The average number of captured stations was {round(statistics.mean(station_lengths), 2)}')
    if 0 in station_lengths:                                    # warn if any location didn't capture data
        cprint('::ATTENTION::', 'red', attrs=['reverse', 'blink'])
        print(f'{station_lengths.count(0)}/{len(station_lengths)} locations did not capture a single precip station. This will *likely* be addressed in the final csv file.')
    else: print('Every location captured at least one precip station.')

    return gdf
Example #25
def upload(request):
    wait = 0  # Sleeper for CRUD during upload
    accountids = []  # Holds current document "unique=True" fields
    added = 0

    if 'GET' == request.method:
        memberdata = Conflict.objects.all()
        context = {'memberdata': memberdata}
        return render(request, 'upload.html', context)
    try:
        # First Condition: Make sure there is a file
        csv_file = request.FILES["csv_file"]
        if len(csv_file) == 0:
            messages.error(request, 'Empty File')
            return render(request, 'upload.html')
        # Second Condition: Make sure the file is a CSV file
        if not csv_file.name.endswith('.csv'):
            messages.error(request, 'File is not CSV type')
            return render(request, 'upload.html')
        # Prepare Data for parsing
        file_data = csv_file.read().decode("utf-8")
        lines = file_data.split("\n")

        # Begin loop through CSV file lines
        for index, line in progress(enumerate(lines)):
            fields = line.split(",")
            if index == 0:
                # Third Condition: Check if top-row CSV fields are as expected
                if (fields[0]
                        == 'first_name') and (fields[1] == 'last_name') and (
                            fields[2] == 'phone_number') and (
                                fields[3]
                                == 'client_member_id') and (fields[4]
                                                            == 'account_id'):
                    pass
                # Throw an Error if document headers don't match
                else:
                    messages.error(request, 'File does not have the correct headers')
                    return render(request, 'upload.html')

            # Save as member if not in database, otherwise save as conflict
            if (len(fields[0]) != 0) and (len(fields[1]) != 0) and (len(
                    fields[2]) != 0) and (len(fields[3]) != 0):
                # Check if current data appeared in current document
                duplicate = False
                if fields[3] in accountids:
                    duplicate = True
                if fields[2] in accountids:
                    duplicate = True
                if fields[2] == 'phone_number':
                    duplicate = True
                # Save to database if doesn't already exist
                if duplicate == False:
                    try:
                        data = Member(
                            first=fields[0],
                            last=fields[1],
                            telephone=fields[2],
                            clientid=fields[3],
                            accountid=fields[4],
                        )
                        # Tracking system for current document items
                        accountids.append(fields[3])
                        accountids.append(fields[2])
                        added += 1
                        data.save()
                        # Sleeper allows for CRUD operations during upload
                        # A more robust version would be using multiprocessors
                        if wait == 5:
                            time.sleep(.2)
                            wait = 0
                        else:
                            wait += 1
                    except:
                        if wait == 5:
                            time.sleep(.2)
                            wait = 0
                        else:
                            wait += 1
                        pass

                # 10.A If the person is in system
                elif duplicate == True:
                    # When commented, this block will not allow duplicates.
                    # This block will sort duplicates into another table
                    # called "conflicts" for further handling.
                    conf = Conflict(
                        first=fields[0],
                        last=fields[1],
                        telephone=fields[2],
                        clientid=fields[3],
                        accountid=fields[4],
                    )
                    conf.save()
        # Message at top of screen once members are added
        messages.success(request, "Successfully Uploaded CSV File")
        return redirect('/upload')
    # If one of the conditions wasn't met
    except Exception as e:
        messages.error(request, "Unable to upload file. " + str(e))
        return redirect('/upload')
Example #26
conditioner = analog.HeterodyneMarkII()
magnet = hardware.Thing('canceling_magnet', {
    'orientation': 'up',
    'distance_from_base_mm': 25
})
hw = hardware.Hardware(conditioner, magnet)
ri = hardware_tools.r2_with_mk2()
ri.set_dac_atten(40)
ri.set_fft_gain(4)
ri.set_modulation_output('high')

# Run
ncf = acquire.new_nc_file(suffix='sweep')
tic = time.time()
try:
    for lo in progress(lo_MHz):
        state = hw.state()
        state['temperature'] = {
            'package': temps.get_temperature_at(time.time())
        }
        tone_banks = np.array([np.array([f]) for f in lo + offsets_MHz])
        ri.set_lo(lomhz=lo, chan_spacing=round_to_MHz)
        sweep = acquire.run_sweep(ri,
                                  tone_banks=tone_banks,
                                  num_tone_samples=num_tone_samples,
                                  length_seconds=length_seconds,
                                  state=state)
        ncf.write(sweep)
finally:
    ncf.close()
    print("Wrote {}".format(ncf.root_path))
    def _load_normalized_signatures(self, signatures_directory, num_of_classes,
                                    signatures_per_class):
        images_have_been_normalized = True
        model_has_been_created = True
        self._signatures = []
        self._signatures_features = []

        progress_value = progress(total=num_of_classes * signatures_per_class,
                                  desc='Loading signatures',
                                  unit=' images')

        if not os.path.exists(signatures_directory + '_normalized'):
            os.mkdir(signatures_directory + '_normalized')
            images_have_been_normalized = False

        if not os.path.exists(signatures_directory + '_normalized_model'):
            os.mkdir(signatures_directory + '_normalized_model')
            model_has_been_created = False

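        # Folders follow a p<NN> layout with files p<NN>s<MM>.png, so build zero-padded
        # two-digit names for both the class folder and the signature file.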
        for class_folder_index in range(1, num_of_classes + 1):
            class_folder_number = '0'

            if class_folder_index < 10:
                class_folder_number += str(class_folder_index)
            else:
                class_folder_number = str(class_folder_index)

            self._signatures_features.append([])
            self._signatures.append([])

            if not images_have_been_normalized:
                os.mkdir(signatures_directory + '_normalized' + '/p' +
                         class_folder_number)

            if not model_has_been_created:
                os.mkdir(signatures_directory + '_normalized_model' + '/p' +
                         class_folder_number)

            for signature_file_index in range(1, signatures_per_class + 1):
                signature_file = '0'

                if signature_file_index < 10:
                    signature_file += str(signature_file_index)
                else:
                    signature_file = str(signature_file_index)

                if not model_has_been_created:
                    if not images_have_been_normalized:
                        loaded_image = Image.open(signatures_directory + '/p' +
                                                  class_folder_number + '/p' +
                                                  class_folder_number + 's' +
                                                  signature_file + '.png')
                        loaded_image = loaded_image.resize((250, 250),
                                                           Image.ANTIALIAS)
                        loaded_image = loaded_image.convert('L')
                        loaded_image.save(signatures_directory +
                                          '_normalized' + '/p' +
                                          class_folder_number + '/p' +
                                          class_folder_number + 's' +
                                          signature_file + '.png')
                        signature_image = loaded_image
                    else:
                        signature_image = Image.open(signatures_directory +
                                                     '_normalized' + '/p' +
                                                     class_folder_number +
                                                     '/p' +
                                                     class_folder_number +
                                                     's' + signature_file +
                                                     '.png')

                    signature_model = self._extract_signature_features_vector(
                        np.asarray(signature_image))
                    self._signatures_features[class_folder_index -
                                              1].append(signature_model)
                    self._signatures[class_folder_index -
                                     1].append(signature_image)

                    signature_model_file = open(
                        signatures_directory + '_normalized_model' + '/p' +
                        class_folder_number + '/p' + class_folder_number +
                        's' + signature_file + '.csv', "+a")
                    for element in signature_model:
                        signature_model_file.write("%f\n" % element)
                    signature_model_file.close()  # flush the features to disk

                else:
                    signature_model_lines = open(
                        signatures_directory + '_normalized_model' + '/p' +
                        class_folder_number + '/p' + class_folder_number +
                        's' + signature_file + '.csv', "r")
                    signature_model_lines = signature_model_lines.readlines()
                    signature_model = []
                    for line in signature_model_lines:
                        signature_model.append(float(line))
                    self._signatures_features[class_folder_index -
                                              1].append(signature_model)

                progress_value.update()