def load_shit(file_paths, save_path):
    for i, path in enumerate(file_paths):
        # first iteration
        if i == 0:
            print('[info] initializing phrase model')
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # initialize phrase model
                phrases = Phrases(reader, delimiter=b" ")
        # every other iteration
        else:
            if i % 1000 == 0:
                progress = (i / len(file_paths)) * 100
                print('[info] processed {}% of files'.format(round(progress, 1)))
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # show the model new data
                phrases.add_vocab(reader)
    # save model after iterations are done (Phrases.save takes a path,
    # so the file does not need to be opened first)
    phrases.save(save_path)
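# Both the snippet above and the one below lean on an extract_text helper that
# is not shown. A minimal sketch, assuming each NDJSON record stores its
# document under a 'text' key (the key name is a guess): it yields one string
# per record, which suits spaCy's nlp.pipe; gensim's Phrases expects lists of
# tokens, so the real helper may tokenize as well (e.g. with .split()).
def extract_text(reader):
    for record in reader:
        yield record['text']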
def lemmatize_file(path, nlp, outdir, fname):
    '''
    Run a spaCy pipeline over an NDJSON file and dump per-document token
    features (text, lemma, POS, dependency, entity type) to a new NDJSON file.
    '''
    with open(path) as f:
        reader = ndjson.reader(f)
        reader = extract_text(reader)
        # n_threads is ignored from spaCy v2.1 onward (v3 uses n_process)
        docs = nlp.pipe(reader, n_threads=16, batch_size=10000)

        spacy_tokens = []
        for doc in docs:
            doc_features = {
                'text': [token.text for token in doc],
                'lemma': [token.lemma_ for token in doc],
                'pos': [token.pos_ for token in doc],
                'dep': [token.dep_ for token in doc],
                'ner': [token.ent_type_ for token in doc]
            }
            spacy_tokens.append(doc_features)
        del docs

    with open(os.path.join(outdir, fname), 'w') as fout:
        ndjson.dump(spacy_tokens, fout)

    return None
def parse_records(self, file_name, **kwargs):
    """Provide an iterator that reads the NDJSON records."""
    with open(file_name) as fh:
        for row in ndjson.reader(fh):
            wkt = shape(row["geometry"]).wkt
            row["geometry"] = f"SRID={self.srid};{wkt}"
            yield row
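# For context, a sketch of what parse_records does to a single row: shapely's
# shape() turns the GeoJSON geometry dict into a geometry object whose .wkt
# string is prefixed with the SRID to form EWKT. The record and the SRID below
# are made up for illustration.
from shapely.geometry import shape

row = {"id": 1, "geometry": {"type": "Point", "coordinates": [12.57, 55.68]}}
wkt = shape(row["geometry"]).wkt          # 'POINT (12.57 55.68)'
row["geometry"] = f"SRID=4326;{wkt}"      # EWKT, assuming SRID 4326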
def test_reader(data):
    text = '\n'.join(json.dumps(item) for item in data) + '\n'
    fp = six.StringIO(text)

    reader = ndjson.reader(fp)
    read_items = [item for item in reader]

    assert read_items == data
def load_data_files(directory: str) -> Iterator[Dict]:
    filepath: str = "/".join((directory, DATA_FILENAME))
    with open(filepath) as f:
        reader = ndjson.reader(f)
        d: dict
        for d in reader:
            yield d
def feature_loader(self, path: str) -> pd.DataFrame:
    """
    Function to create a dataframe from source files.
    """
    with open(path) as f:
        reader = ndjson.reader(f)
        for post in reader:
            df = pd.DataFrame([post], columns=post.keys())
            # note: concatenating inside the loop copies the frame each time;
            # collecting rows and concatenating once would scale better
            self.dataframe = pd.concat([self.dataframe, df],
                                       axis=0, ignore_index=True)
    return self.dataframe
def connect(self):
    if self.reader is None and self.writer is None:
        if self.access == AccessMode.READ:
            self.file_handle = open(self.engine_params.file_path, "r",
                                    encoding=self.encoding)
            self.file_size = os.stat(self.engine_params.file_path).st_size
            self.reader = ndjson.reader(self.file_handle)
        elif self.access == AccessMode.WRITE:
            self.file_handle = open(self.engine_params.file_path, "w",
                                    encoding=self.encoding)
            self.writer = ndjson.writer(self.file_handle)
        else:
            raise ValueError("Unknown access mode")
def test_format_tweet_as_csv_row(self):
    output = StringIO()
    writer = csv.writer(output)
    writer.writerow(TWEET_FIELDS)

    with open_resource('tweet-export.jsonl') as f:
        for item in ndjson.reader(f):
            tweet = item['_source']
            row = format_tweet_as_csv_row(tweet, item_id=item['_id'])
            assert len(row) == len(TWEET_FIELDS)
            writer.writerow(row)

    with open_resource('tweet-export.csv') as f:
        output.seek(0)
        assert list(csv.reader(output)) == list(csv.reader(f))
@classmethod
def create_from_local_file(cls, client, project_id: str, name: str,
                           file: Path, validate_file=True) -> 'BulkImportRequest':
    """
    Creates a BulkImportRequest from a local ndjson file with predictions.

    Args:
        client (Client): a Labelbox client
        project_id (str): id of project for which predictions will be imported
        name (str): name of BulkImportRequest
        file (Path): local ndjson file with predictions
        validate_file (bool): a flag indicating if there should be a validation
            if `file` is a valid ndjson file
    Returns:
        BulkImportRequest object
    """
    file_name = _make_file_name(project_id, name)
    content_length = file.stat().st_size
    request_data = _make_request_data(project_id, name, content_length,
                                      file_name)

    with file.open('rb') as f:
        if validate_file:
            reader = ndjson.reader(f)
            # ensure that the underlying json load call is valid
            # https://github.com/rhgrant10/ndjson/blob/ff2f03c56b21f28f7271b27da35ca4a8bf9a05d0/ndjson/api.py#L53
            # by iterating through the file so we only store
            # each line in memory rather than the entire file
            try:
                _validate_ndjson(reader)
            except ValueError:
                raise ValueError(f"{file} is not a valid ndjson file")
            else:
                f.seek(0)
        file_data = (file.name, f, NDJSON_MIME_TYPE)
        response_data = _send_create_file_command(client, request_data,
                                                  file_name, file_data)
    return cls(client, response_data["createBulkImportRequest"])
def test_transform_tweet_into_csv_dict(self):
    output = StringIO()
    writer = csv.DictWriter(
        output,
        fieldnames=TWEET_FIELDS,
        extrasaction='ignore',
        restval='',
        quoting=csv.QUOTE_MINIMAL
    )
    writer.writeheader()

    with open_resource('tweet-export.jsonl') as f:
        for item in ndjson.reader(f):
            tweet = item['_source']
            transform_tweet_into_csv_dict(tweet, item_id=item['_id'])
            writer.writerow(tweet)

    with open_resource('tweet-export.csv') as f:
        output.seek(0)
        assert list(csv.DictReader(output)) == list(csv.DictReader(f))
def ndjson_to_list(datafile):
    with open(datafile) as f:
        reader = ndjson.reader(f)
        data = [line for line in reader]
    return data
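# The same result in one call: the ndjson package's load() reads an entire
# file-like object into a list. The file name here is hypothetical.
import ndjson

with open('records.ndjson') as f:
    data = ndjson.load(f)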
import ndjson
from imblearn.under_sampling import RandomUnderSampler

###############################################################################
#####                               Corpus                                #####
###############################################################################
#### keys
## all
# 'overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName',
# 'reviewText', 'summary', 'unixReviewTime'
## optional
# vote

ratings = []
reviews = []
summaries = []
with open("data/raw/Video_Games_5.json", "r") as infile:
    reader = ndjson.reader(infile)
    for review in reader:
        try:
            rating = review["overall"]
            rv = review["reviewText"]
            s = review["summary"]
        except Exception:
            continue
        if len(rv) > 0 and len(s) > 0:
            ratings.append(rating)
            reviews.append(rv)
            summaries.append(s)

###############################################################################
#####                     The distribution of stars                       #####
###############################################################################
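# The script breaks off at the distribution banner. A minimal sketch of the
# kind of tally that section presumably computes; the Counter approach is an
# assumption, not the original code.
from collections import Counter

star_counts = Counter(ratings)
for stars in sorted(star_counts):
    print(f"{stars:.0f} stars: {star_counts[stars]} reviews")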
""" Created on Thu Jun 4 10:44:04 2020 @author: Marco Polignano """ import pandas as pd import ndjson import numpy as np from config import TRAINING_SET_PATH dataframe = pd.DataFrame() #LOADING TRAINING SET with open(TRAINING_SET_PATH) as f: reader = ndjson.reader(f) for post in reader: df = pd.DataFrame([post], columns=post.keys()) dataframe = pd.concat([dataframe, df], axis=0, ignore_index=True) print(dataframe) X = dataframe['sentence'] y = dataframe['score'] #RMSE function def rmse(predictions, targets): return np.sqrt(((predictions - targets)**2).mean())
def get_jsonl_resource(name):
    with open(join(RESOURCES_DIR, name), encoding='utf-8') as f:
        return list(ndjson.reader(f))