Example #1
    def test_buffered_writer_wrapper_works(self):
        """
        Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which
        passes a memoryview object to the underlying stream in python >= 2.7
        """
        expected = u'не думай о секундах свысока'

        with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fout:
            with io.BufferedWriter(fout) as sub_out:
                sub_out.write(expected.encode('utf-8'))

        with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME),
                             'rb') as fin:
            with io.TextIOWrapper(fin, encoding='utf-8') as text:
                actual = text.read()

        self.assertEqual(expected, actual)
Example #2
    def _publish_last_updated(self):
        """Write the timestamp when file of the dataset were last modified to GCS."""
        last_updated_path = (f"api/{self.api_version}/tables/{self.dataset}/"
                             f"{self.table}/{self.version}/last_updated")
        output_file = f"gs://{self.target_bucket}/{last_updated_path}"

        logging.info(f"Write last_updated to {output_file}")

        with smart_open.open(output_file, "w") as fout:
            last_updated = self.last_updated.strftime("%Y-%m-%d %H:%M:%S")
            fout.write(json.dumps(last_updated))

        # set Content-Type to json so that timestamp is displayed in the browser
        blob = self.storage_client.get_bucket(
            self.target_bucket).get_blob(last_updated_path)
        blob.content_type = "application/json"
        blob.patch()
Example #3
def transit_groupby_csv(session,
                        params,
                        s3_filename,
                        key,
                        agg,
                        transport_params,
                        chunk_size=1e6):
    """
    ###  https://maxhalford.github.io/blog/streaming-groupbys-in-pandas-for-big-datasets/ ###

    pandas function to reduce memory usage. Data needs to be SORTED by a key and is processed by chunk_size batches.
    this can lead to hanging or orphan keys which is also handled.

    In our case data is grouped by lrimoshipno and sorted by movementdatetime


    :param session: boto3 session to access athena data
    :param params: boto3 parameter dictionary
    :param s3_filename: file to process after running get_data
    :param key: data has to be sorted by this key
    :param agg: data processing function
    :param chunk_size: number of rows per batch for processing
    :return: dataframe with potential port coordinates
    """
    chunks = pd.read_csv(open('s3://' + params['bucket'] + '/' +
                              params['path'] + '/' + s3_filename,
                              transport_params=transport_params),
                         chunksize=chunk_size,
                         parse_dates=['movementdatetime'])
    results = []
    orphans = pd.DataFrame()
    for chunk in tqdm(chunks):
        # Add the previous orphans to the chunk
        chunk = pd.concat((orphans, chunk))

        # Determine which rows are orphans
        last_val = chunk[key].iloc[-1]
        is_orphan = chunk[key] == last_val

        # Put the new orphans aside
        chunk, orphans = chunk[~is_orphan], chunk[is_orphan]

        # Perform the aggregation and store the results
        result = agg(chunk)
        results.append(result)
    # Aggregate the rows left over as orphans from the final chunk
    if not orphans.empty:
        results.append(agg(orphans))
    return pd.concat(results)
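A hedged usage sketch of the kind of `agg` callable this function expects; the `lat`/`lon` columns, the file name, and the `mean_position` name are assumptions, not part of the original.

def mean_position(chunk):
    # Reduce each vessel's rows in this chunk to a mean position
    return chunk.groupby('lrimoshipno').agg(lat=('lat', 'mean'),
                                            lon=('lon', 'mean'))

# candidate_ports = transit_groupby_csv(session, params, 'movements.csv',
#                                       key='lrimoshipno', agg=mean_position,
#                                       transport_params=transport_params)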
Example #4
    def run(self):
        """Starts extracting data from the source files

        Loops over each source file passing it to the users scrapers `self.extract` method.
        Passing in the source files raw content

        If all the sources need to be passed in and extracted at the same time, then the user may
        override this method to do so.
        """
        logger.info("Start Extract",
                    extra={'task': self.task,
                           **self.scraper.log_extras(),
                           'time_started': self.time_extracted,
                           })

        for source_idx, source_file in enumerate(self._get_sources()):
            raw_source = None
            transport_params = {}
            if source_file.startswith('s3://'):
                transport_params = _get_s3_params(self.scraper,
                                                  context_type='downloader')

            with open(source_file, 'r', transport_params=transport_params) as f:
                raw_source = f.read()

            try:
                extraction_tasks = self._get_extraction_tasks(raw_source, source_idx)
                if not extraction_tasks:
                    continue

                for extraction_task in extraction_tasks:
                    extraction_task(raw_source)

            except Exception as e:
                logger.exception(f"Extraction Failed: {e}",
                                 extra={'task': self.task,
                                        'source_file': source_file,
                                        **self.scraper.log_extras(),
                                        **get_root_exc_log_overides(),
                                        })

        logger.debug('Extract finished',
                     extra={'task': self.task,
                            **self.scraper.log_extras(),
                            'time_finished': datetime.datetime.utcnow().isoformat() + 'Z',
                            })
Example #5
def test_gcs_performance_small_reads(benchmark):
    initialize_bucket()

    ONE_MIB = 1024**2
    one_megabyte_of_msgs = io.BytesIO()
    msg = b'\x0f' + b'0123456789abcde'  # a length-prefixed "message"
    for _ in range(0, ONE_MIB, len(msg)):
        one_megabyte_of_msgs.write(msg)
    one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue()

    key = _GCS_URL + '/many_reads_performance.bin'

    with smart_open.open(key, 'wb') as fout:
        fout.write(one_megabyte_of_msgs)

    actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB)
    assert actual == one_megabyte_of_msgs
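A sketch of what the `read_length_prefixed_messages` helper used by this benchmark might look like (an assumption, not shown in the original): read a one-byte length, then that many payload bytes, until EOF.

import io

import smart_open


def read_length_prefixed_messages(url, mode, **kwargs):
    result = io.BytesIO()
    with smart_open.open(url, mode, **kwargs) as fin:
        length_byte = fin.read(1)
        while length_byte:
            result.write(length_byte)
            result.write(fin.read(ord(length_byte)))  # read the payload bytes
            length_byte = fin.read(1)
    return result.getvalue()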
Example #6
async def _fetch_data_from_s3(bucket, key, context):
    """
        Stream data from the S3 bucket. Create batches of size MAX_PAYLOAD_SIZE
        and create async requests from the batches.
    """
    log_file_size = boto3.resource('s3').Bucket(
        bucket).Object(key).content_length
    if log_file_size > MAX_FILE_SIZE:
        logger.error(
            "The log file uploaded to S3 is larger than the supported max size of 400MB")
        return

    s3MetaData = {
        "invoked_function_arn": context.invoked_function_arn,
        "s3_bucket_name": bucket
    }
    log_file_url = "s3://{}/{}".format(bucket, key)
    async with aiohttp.ClientSession() as session:
        log_batches = []
        batch_request = []
        batch_counter = 1
        log_batch_size = 0
        start = time.time()
        with open(log_file_url, encoding='utf-8') as log_lines:
            for index, log in enumerate(log_lines):
                log_batch_size += sys.getsizeof(log)
                if index % 500 == 0:
                    logger.debug(f"index: {index}")
                log_batches.append(log)
                if log_batch_size > (MAX_BATCH_SIZE * BATCH_SIZE_FACTOR):
                    logger.debug(f"sending batch: {batch_counter}")
                    data = {"context": s3MetaData, "entry": log_batches}
                    batch_request.append(create_log_payload_request(data, session))
                    if len(batch_request) >= REQUEST_BATCH_SIZE:
                        await asyncio.gather(*batch_request)
                        batch_request = []
                    log_batches = []
                    log_batch_size = 0
                    batch_counter += 1
        data = {"context": s3MetaData, "entry": log_batches}
        batch_request.append(create_log_payload_request(data, session))
        logger.info("Sending data to NR logs.....")
        output = await asyncio.gather(*batch_request)
        end = time.time()
        logger.debug(f"time elapsed to send to NR Logs: {end - start}")
Example #7
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--uuids', help='.txt file with hca-util submission uuids (uuid only). 1 uuid per line')
    parser.add_argument('--num_reads', default=1000, type=int, help='number of reads to test')

    args = parser.parse_args()

    # Bucket containing the hca-util upload areas
    bucket_name = 'hca-util-upload-area'

    uuids = pd.read_csv(args.uuids, header=None)
    uuids = list(uuids[0])

    for uuid in uuids:

        s3 = boto3.resource('s3')
        my_bucket = s3.Bucket(bucket_name)
        keys = ['s3://hca-util-upload-area/' + str(s3_object.key) for s3_object in my_bucket.objects.all()]
        filenames = [key for key in keys if uuid in key]
        filenames = filenames[1:]
        filenames = [file for file in filenames if '.fastq.gz' in file]

        my_dict = {}
        for filename in filenames:
            my_dict[filename] = {}
            with smart_open.open(filename) as f:
                count = 0
                len_seqs = []
                records = SeqIO.parse(f, 'fastq')
                for record in records:
                    if count < args.num_reads:
                        len_seqs.append(len(str(record.seq)))
                        count += 1
                    else:
                        break
                len_uniq = list(set(len_seqs))
                for uniq in len_uniq:
                    num = len_seqs.count(uniq)
                    my_dict[filename].update({uniq: num})

        data = pd.DataFrame.from_dict(my_dict, orient='index')
        out_file = uuid + "_read_lengths.txt"
        data.to_csv(out_file,sep="\t")
        print("Done processing uuid: %s" % (uuid))
Example #8
    def __init__(self, filename, validate_file=False, limit=None):
        """
        filename: the vocabulary file.  It is a flat text file with one
            (normalized) token per line.  In addition, the file should also
            contain the special tokens <S>, </S>, <UNK> (case sensitive). Can be None.
        limit: process only the first <limit> words from the file; can be useful at inference
        (we assume the vocabulary is sorted by frequency).
        """
        self._id_to_word = []
        self._word_to_id = {}
        self._unk = -1
        self._bos = -1
        self._eos = -1

        if filename:
            vocab_source = open(filename, 'r')  # Loading vocabulary from file
        else:
            logging.info(
                "No vocabulary file provided; using special tokens only.")
            vocab_source = ["<S>", "</S>",
                            "<UNK>"]  # Creating a toy vocabulary ourselves
        idx = 0
        for line in vocab_source:
            word_name = line.strip()
            if word_name == '<S>':
                self._bos = idx
            elif word_name == '</S>':
                self._eos = idx
            elif word_name == '<UNK>':
                self._unk = idx
            if word_name == '!!!MAXTERMID':
                continue

            self._id_to_word.append(word_name)
            self._word_to_id[word_name] = idx
            idx += 1
            if idx == limit:
                break
        logging.info(
            f"We will cache the vocabulary of {len(self._id_to_word)} tokens.")
        # check to ensure file has special tokens
        if validate_file:
            if self._bos == -1 or self._eos == -1 or self._unk == -1:
                raise ValueError("Ensure the vocabulary file has "
                                 "<S>, </S>, <UNK> tokens")
Example #9
def upload_from_url(url, s3_key, on_stream_opened=None):
    bucket = app.config['LOCH_S3_BUCKET']
    s3_url = build_s3_url(s3_key)
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            app.logger.error(
                f'Received unexpected status code, aborting S3 upload '
                f'(status={response.status_code}, body={response.text}, key={s3_key} url={url})'
            )
            raise ConnectionError(
                f'Response {response.status_code}: {response.text}')
        if on_stream_opened:
            on_stream_opened(response.headers)
        try:
            s3_upload_args = {
                'ServerSideEncryption': app.config['LOCH_S3_ENCRYPTION']
            }
            if s3_url.endswith('.gz'):
                s3_upload_args.update({
                    'ContentEncoding': 'gzip',
                    'ContentType': 'text/plain',
                })
            session = get_session()
            # smart_open needs to be told to ignore the .gz extension, or it will smartly attempt to double-compress it.
            with smart_open.open(
                    s3_url,
                    'wb',
                    ignore_ext=True,
                    transport_params=dict(
                        session=session,
                        multipart_upload_kwargs=s3_upload_args),
            ) as s3_out:
                for chunk in response.iter_content(chunk_size=1024):
                    s3_out.write(chunk)
        except (ClientError, ConnectionError, ValueError) as e:
            app.logger.error(
                f'Error on S3 upload: source_url={url}, bucket={bucket}, key={s3_key}, error={e}'
            )
            raise e
    s3_response = get_client().head_object(Bucket=bucket, Key=s3_key)
    if s3_response:
        app.logger.info(
            f'S3 upload complete: source_url={url}, bucket={bucket}, key={s3_key}'
        )
        return s3_response
Example #10
def find_secrets():
    # create empty dataframe
    df = pd.DataFrame(columns=[
        'cik', 'date', 'company_name', 'form_type', 'filename', 'total',
        'has_secret'
    ] + secret_list + ['has_protection'] + protect_list)
    progress = 0

    client = connect_s3()

    # iterate through files
    df_mf = pd.read_csv('master_file_list_subset_clean.csv')
    for i in tqdm(df_mf.index):
        f = df_mf.at[i, 'Filename'].replace('edgar/data/', '')
        with open(f's3://sec-filings-v2/{f}',
                  transport_params={'client': client}) as f_in:
            contents = f_in.read()

        sec_header, ten_k_body = text_preprocessing(contents)
        fyear = extract_fyear(sec_header)
        sic = extract_sic(sec_header)

        text_list = tokenize(ten_k_body)
        # search for the keywords
        counts_dict = get_count(text_list)
        counts_dict['cik'] = df_mf.at[i, 'CIK']
        counts_dict['date'] = df_mf.at[i, 'Date Filed']
        counts_dict['company_name'] = df_mf.at[i, 'Company Name']
        counts_dict['filename'] = f
        counts_dict['form_type'] = df_mf.at[i, 'Form Type']

        df = df.append(counts_dict, ignore_index=True)

        if progress % 100 == 0 and progress >= 100:
            df_temp = df.sort_values(by=['cik', 'date'])
            csv_to_s3(df_temp, client, f'keywords_temp_{progress}.csv')
            if progress != 100:
                client.delete_object(Bucket='10k-output',
                                     Key=f'keywords_temp_{progress-100}.csv')

        progress += 1

    df = df.sort_values(by=['cik', 'date'])
    csv_to_s3(df, client, 'keywords_final.csv')
    stop_ec2()
Example #11
def read_file(fpath: str, **kwargs) -> str:
    """
    Read file with `smart_open` from file path.

    Parameters
    -----------
    fpath: str
        File path.
    kwargs: optional
        Other supported `smart_open` parameters, passed through to `smart_open.open`.

    Returns
    --------
        data string of the file.
    """
    with smart_open.open(fpath, **kwargs) as f:
        data = f.read()
    return data
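A hedged usage sketch; the URIs below are placeholders, and any extra smart_open keyword is forwarded through `**kwargs`.

notes = read_file('s3://my-bucket/notes/2021-01-01.txt', encoding='utf-8')
raw = read_file('gs://my-bucket/archive.csv.gz', mode='rb')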
Example #12
    def __iter__(self):
        jieba.enable_parallel(8)
        for filename in self.file_list:
            with open(self.root_path + filename, encoding='utf-8') as f:
                for line in f:
                    words = self._process(line)
                    if not words or len(words) < 2:  # fewer than 2 words can't contain 2 cities
                        continue
                    words, cities = self._retrieve_cities(words)
                    # get unique cities
                    cities = list(set(cities))
                    if len(cities) < 2:  # fewer than 2 cities won't form a link
                        continue
#                     yield {'words': self.dictionary.doc2bow(words), 'cities': cities}
                    yield {'words': words, 'cities': cities}
Example #13
    def restore_spilled_objects(self, object_refs: List[ObjectRef],
                                url_with_offset_list: List[str]):
        for i in range(len(object_refs)):
            object_ref = object_refs[i]
            url_with_offset = url_with_offset_list[i].decode()
            # Retrieve the information needed.
            parsed_result = parse_url_with_offset(url_with_offset)
            base_url = parsed_result.base_url
            offset = parsed_result.offset
            # Read a part of the file and recover the object.
            with open(base_url, "rb") as f:
                f.seek(offset)
                metadata_len = int.from_bytes(f.read(8), byteorder="little")
                buf_len = int.from_bytes(f.read(8), byteorder="little")
                self._size_check(metadata_len, buf_len, parsed_result.size)
                metadata = f.read(metadata_len)
                # read remaining data to our buffer
                self._put_object_to_store(metadata, buf_len, f, object_ref)
Example #14
    def process(self, something):
        clear_data = []
        with open(self.input_path) as fin:
            data = fin.read()
            products_list = ast.literal_eval(data)
            for prod in products_list:
                product_id = prod.get("id")
                product_name = prod.get("name")
                product_price = prod.get("price")
                created_at = prod.get("created_at")
                currency = prod.get("currency")
                clear_data.append([
                    product_id, product_name, product_price, currency,
                    created_at
                ])
        print(clear_data)
        logging.getLogger().setLevel(logging.INFO)
        yield clear_data
Example #15
    def __build_sentences(corpus_path, data_path):
        sentences_path = '%s/sentences.txt' % data_path

        if not os.path.exists(sentences_path):
            # Read the corpus, segment each line with jieba, then save the segmented sentences to a file
            sentences = []
            with open(corpus_path, 'r') as corpus_file:
                with open(sentences_path, 'w') as sentences_file:
                    reader = csv.reader(corpus_file)
                    index = 0
                    for row in reader:
                        index += 1
                        sentences.append(' '.join(jieba.cut(row[1])))
                        if index % 2000 == 0:
                            sentences_file.write('\n'.join(sentences) + '\n')
                            sentences.clear()
                    # Flush whatever is left after the last full batch of 2000 rows
                    if sentences:
                        sentences_file.write('\n'.join(sentences))

        return word2vec.LineSentence(smart_open.open(sentences_path))
Example #16
def add_tfidf_records(
        s3_filename='s3://hanks-bda-2020-01/data-output/tfidf/000000_0'):
    res = boto3.resource('dynamodb')
    table = res.Table('tfidf')
    create_table()
    with open(s3_filename, 'rb') as fin:
        i = 1
        for line in fin:
            strvalue = line.decode('utf-8').strip()
            doc_id, term, value = strvalue.split('\x01')
            table.put_item(Item={
                'term': term,
                'doc_id': doc_id,
                'value': Decimal(value)
            })
            if (i % 1000) == 0:
                print(f"I {i}")
            i += 1
Example #17
def unpickle(fname):
    """Load object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    """
    with open(fname, 'rb') as f:
        # needed because loading from S3 doesn't support readline()
        return _pickle.load(f, encoding='latin1')
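A hedged usage sketch; the path is a placeholder and can equally be a local, S3, HDFS, or compressed file.

model = unpickle('s3://my-bucket/models/lda_model.pkl')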
Example #18
def get_paths_from_csv(_fnfn,
                       path_key=PATH_KEY,
                       path_ext=PATH_EXT,
                       path_prefix='',
                       path_suffix='',
                       sep='\t'):
    paths = []
    #with codecs.open(_fnfn,encoding='utf-8') as pf:
    if not path_key:
        path_key = DEFAULT_PATH_KEY
    with open(_fnfn) as pf:
        reader = csv.DictReader(pf, delimiter=sep)
        for dx in reader:
            path = dx.get(path_key, '')
            if not path:
                continue
            if path_prefix:
                path = os.path.join(path_prefix, path)
            if path_suffix:
                path = path + path_suffix
            paths.append(path)
    return paths
Example #19
def parallel_read(old, fname):
    old_val, treedef = jax.tree_flatten(old)
    with open(fname, "rb") as f:
        buf = f.read()
        f_io = io.BytesIO(buf)
        loaded = np.load(f_io)

    new_vals = []
    for i in loaded:
        new_vals.append(loaded[i])

    for o, n in zip(new_vals, old_val):
        assert o.shape == n.shape, "Incompatible checkpoint"

        # np.load returns bfloat16 arrays as raw 2-byte void ('V2'); view them as bfloat16 again
        if o.dtype == np.dtype('V2'):
            o.dtype = jnp.bfloat16

    return jax.tree_unflatten(treedef, new_vals)
Example #20
def open_s3(uri: str, *args: Any, **kwargs: Any) -> smart_open.open:
    """Stream an s3 key for read / write operations.

    This is a wrapper around smart_open.open which allows us to fine-tune
    access control for testing.
    """
    transport_params = {
        'resource_kwargs': {
            'endpoint_url': _ENDPOINT_URL,
        },
        'ExtraArgs': {
            'ServerSideEncryption': 'AES256'
        }
    }
    return smart_open.open(uri,
                           transport_params=transport_params,
                           *args,
                           **kwargs)
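A hedged usage sketch; the bucket and key are placeholders.

with open_s3('s3://test-bucket/reports/summary.json', 'w') as fout:
    fout.write('{"status": "ok"}')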
Example #21
    def write_file_to_storage(self, records):
        #Get parameters to pass to the smart_open open function
        transport_params = {
            'session': self.session,
            'resource_kwargs': {
                'endpoint_url': self.endpoint_url,
            }
        }

        #Construct the storage URI
        storage_uri = 's3://%s/%s.tsv.gz' % (self.s3_bucket, self.s3_key)

        #Write records to S3
        with smart_open.open(storage_uri,
                             'w',
                             transport_params=transport_params) as fout:
            file_writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
            file_writer.writerows(records)
Example #22
    def output_samples(self, filename, n=None):
        with torch.no_grad():
            for name, samples in (('cat', self.categorical_samples),
                                  ('cont', self.continuous_samples)):
                if samples is None:
                    continue
                grid_imgs = self.trainer.target_g(samples)
                grid_filename = os.path.join(
                    os.path.dirname(filename),
                    f'info_{name}_{os.path.basename(filename)}')
                nrow = samples.shape[1]
                with smart_open.open(grid_filename, 'wb') as output_file:
                    torchvision.utils.save_image(
                        grid_imgs, output_file, nrow=nrow, normalize=True,
                        range=(-1, 1), format='png')
Example #23
def wem_export_years():

    for year in [2020, 2019]:
        energy = wem_energy_year(year)
        market_value = wem_market_value_year(year)

        json_envelope = energy + market_value

        year_path = BASE_EXPORT + f"/wem/energy/daily/{year}.json"

        with open(
                year_path,
                "w",
                transport_params=dict(multipart_upload_kwargs=UPLOAD_ARGS),
        ) as fh:
            json.dump(json_envelope, fh, cls=NemEncoder)
Example #24
def mergesort(sorted_filenames, columns, nway=2, tmp_dir='', encoding='utf-8'):
    """Merge these 2 sorted csv files into a single output file
    """
    merge_n = 0
    while len(sorted_filenames) > 1:
        merge_filenames, sorted_filenames = sorted_filenames[:nway], sorted_filenames[nway:]

        output_filename = os.path.join(tmp_dir, 'merge{}.csv'.format(merge_n))
        with open(output_filename, 'w', newline='\n', encoding=encoding) as output_fp:
            writer = csv.writer(output_fp)
            merge_n += 1
            rows = (yield_csv_rows(filename, columns, encoding) for filename in merge_filenames)
            writer.writerows(heapq.merge(*rows))
        sorted_filenames.append(output_filename)

        for filename in merge_filenames:
            os.remove(filename)
    return sorted_filenames[0]
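A minimal sketch of the `yield_csv_rows` helper this merge relies on (not shown in the original, so this is an assumption): it lazily streams one sorted file's rows so that `heapq.merge` can interleave the inputs without loading them into memory.

import csv


def yield_csv_rows(filename, columns, encoding='utf-8'):
    # Assumes the key columns come first in each row, so plain row comparison
    # preserves the files' existing sort order; `columns` is accepted for
    # interface parity, and a fuller version could build a sort key from it.
    with open(filename, 'r', newline='', encoding=encoding) as fp:
        for row in csv.reader(fp):
            yield row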
Example #25
    def _collect(self, data_dir: str) -> Iterator[str]:  # type: ignore
        """
        This function will collect the files of the given directory. If the
         'suffix' field in the config is set, it will only take files matching
         that suffix. See :func:`~forte.data.readers.RecursiveDirectory
         DeserializeReader.default_configs` for the default configs.

        Args:
            data_dir: The root directory to search for the data packs.

        Returns: Iterator of the data pack string from the directory.
        """
        for root, _, files in os.walk(data_dir):
            for file in files:
                if not self.configs.suffix or file.endswith(
                        self.configs.suffix):
                    with open(os.path.join(root, file)) as f:
                        yield f.read()
Example #26
def download_file(src: Union[str, "Pathy"],
                  dest: Path,
                  *,
                  force: bool = False) -> None:
    """Download a file using smart_open.
    url (str): The URL of the file.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    import smart_open

    if dest.exists() and not force:
        return None
    src = str(src)
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())
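A hedged usage sketch; the URL and destination paths are placeholders.

from pathlib import Path

download_file("https://example.com/models/model.tar.gz", Path("/tmp/model.tar.gz"))
download_file("s3://my-bucket/models/model.bin", Path("/tmp/model.bin"), force=True)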
Example #27
    def test_write_03b(self):
        """Does writing a last chunk size equal to a multiple of the min_part_size work?"""
        min_part_size = 256 * 1024
        smart_open_write = smart_open.gcs.Writer(
            BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size
        )
        expected = b"t" * min_part_size * 2

        with smart_open_write as fout:
            fout.write(expected)
            self.assertEqual(fout._current_part.tell(), 262144)
            self.assertEqual(fout._total_parts, 1)

        # read back the same key and check its content
        with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin:
            output = fin.read().encode('utf-8')

        self.assertEqual(output, expected)
Example #28
    def _open(self, path_or_uri, mode):
        from smart_open import open

        if isinstance(path_or_uri, LocalGitFile):
            import git

            return io.BytesIO(
                git.Repo(path_or_uri.repo_path).git.show("{}:{}".format(
                    path_or_uri.ref, path_or_uri.path)).encode())

        if isinstance(path_or_uri, SourceFile):
            path_or_uri = path_or_uri.get_path_or_uri()

        try:
            return open(path_or_uri, mode)
        except Exception as e:
            raise WorkflowError(
                "Failed to open source file {}".format(path_or_uri), e)
Example #29
def _train_tokenizer(store: BaseConfig) -> spm.SentencePieceProcessor:
    """
    Trains SentencePiece tokenizer on training data
    """
    logging.info("Training SentencePiece tokenizer")
    spm.SentencePieceTrainer.Train(
        input=store.training_data,
        model_prefix=store.tokenizer_prefix,
        user_defined_symbols=["<n>", store.field_delimiter_token],
        vocab_size=store.vocab_size,
        hard_vocab_limit=False,
        max_sentence_length=store.max_line_len,
        character_coverage=store.character_coverage)
    """
    spm.SentencePieceTrainer.Train(
        f'--input={store.training_data} '
        f'--model_prefix={store.tokenizer_prefix} '
        f'--user_defined_symbols=<n>,{store.field_delimiter_token} '
        f'--vocab_size={store.vocab_size} '
        f'--hard_vocab_limit=false '
        f'--character_coverage={store.character_coverage}')
    """
    _move_tokenizer_model(store)

    sp = spm.SentencePieceProcessor()
    logging.info(f"Loading tokenizer from: {Path(store.tokenizer_model).name}")
    sp.Load(store.tokenizer_model)

    # print sample output
    with open(store.training_data) as f:
        sample = f.readline().strip()
    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n'
        .format(repr(sample),
                ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n'
        .format(repr(sample),
                ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))
    logging.info(
        f"Saving SentencePiece model to {store.tokenizer_prefix}.model and {store.tokenizer_prefix}.vocab"
    )
    return sp
Example #30
def load_data(node_id, node_emb, tq_emb, qc, qc_loc, FLAGS, filename):
    output_X, output_R, output_y = [], [], []
    n_fea = 2 * FLAGS.emb_dim + 2
    with open(filename, 'r') as fp:
        for line in fp:
            data = line.strip().split('\t')
            queries = [data[i] for i in range(1, len(data), 3)]
            embs = [
                get_tq_emb(node_id, node_emb, tq_emb, FLAGS, q)
                for q in queries
            ]
            # qembs = [ node_emb[node_id[('query', q)]] if ('query', q) in node_id else np.zeros(FLAGS.emb_dim) for q in queries]
            data = np.array([])
            for i in range(len(queries)):
                # data = np.append(data, [math.log10(i + 1)])
                # data = np.append(data, qembs[i])
                data = np.append(data, embs[i])
                data = np.append(
                    data,
                    embs[i] - embs[i - 1] if i > 0 else np.zeros(FLAGS.emb_dim))
            for i in range(1, len(queries)):
                if (queries[i - 1], queries[i]) not in qc_loc:
                    continue

                L = max(0, (i - FLAGS.max_len)) * n_fea
                R = i * n_fea
                X = data[L:R]
                if X.size < n_fea * FLAGS.max_len:
                    X = np.append(np.zeros(n_fea * FLAGS.max_len - X.size), X)
                assert (X.size == n_fea * FLAGS.max_len)

                for c in qc[queries[i - 1]]:
                    R = np.array([])
                    cemb = get_tq_emb(node_id, node_emb, tq_emb, FLAGS, c)
                    R = np.append(R, cemb)
                    R = np.append(R, cemb - embs[i - 1])

                    output_X.append(X)
                    output_R.append(R)
                    output_y.append(1.0 if c == queries[i] else 0.0)

    output_y = np.array(output_y).reshape((len(output_y), 1))

    return np.array(output_X), np.array(output_R), output_y