Example No. 1
import ndjson
from gensim.models import Phrases


def load_shit(file_paths, save_path):
    for i, path in enumerate(file_paths):
        # first iteration: initialize the phrase model
        if i == 0:
            print('[info] initializing phrase model')
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)  # helper assumed defined elsewhere
                # bytes delimiter for gensim < 4; use " " on gensim >= 4
                phrases = Phrases(reader, delimiter=b" ")

        # every following iteration: update the model
        else:
            # report progress every 1000 files
            if i % 1000 == 0:
                progress = (i / len(file_paths)) * 100
                print('[info] processed {}% of files'.format(round(progress, 1)))

            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # show the model new data
                phrases.add_vocab(reader)

    # save the model once all files have been processed;
    # Phrases.save handles the file itself, so no open() wrapper is needed
    phrases.save(save_path)
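
A minimal usage sketch for the loader above. The glob pattern and save path are placeholders, and extract_text is assumed to be a helper (defined elsewhere) that yields tokenized texts from the ndjson records.

from glob import glob

file_paths = sorted(glob('data/ndjson/*.ndjson'))  # hypothetical layout
load_shit(file_paths, save_path='models/phrases.model')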
Example No. 2
import os
import ndjson


def lemmatize_file(path, nlp, outdir, fname):
    '''
    Run a spaCy pipeline over one ndjson file and dump per-document
    token features (text, lemma, POS, dependency, NER) to outdir/fname.
    '''
    with open(path) as f:
        reader = ndjson.reader(f)
        reader = extract_text(reader)

        # n_threads was removed in spaCy v3; n_process is the replacement
        docs = nlp.pipe(reader, n_process=16, batch_size=10000)

        spacy_tokens = []
        for doc in docs:
            doc_features = {}
            doc_features.update({
                'text': [token.text for token in doc],
                'lemma': [token.lemma_ for token in doc],
                'pos': [token.pos_ for token in doc],
                'dep': [token.dep_ for token in doc],
                'ner': [token.ent_type_ for token in doc]
            })
            spacy_tokens.append(doc_features)

        del docs

    with open(os.path.join(outdir, fname), 'w') as fout:
        ndjson.dump(spacy_tokens, fout)

    return None
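
A hedged usage example for lemmatize_file; the model name and paths are placeholders, and the input file is assumed to hold one JSON object per line in the shape extract_text expects.

import spacy

nlp = spacy.load('en_core_web_sm')  # any installed pipeline works
lemmatize_file('data/raw/posts.ndjson', nlp,
               outdir='data/processed', fname='posts_tokens.ndjson')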
Example No. 3
    def parse_records(self, file_name, **kwargs):
        """Provide an iterator that reads the NDJSON records."""
        with open(file_name) as fh:
            for row in ndjson.reader(fh):
                # convert the GeoJSON geometry mapping to EWKT for the database
                wkt = shape(row["geometry"]).wkt
                row["geometry"] = f"SRID={self.srid};{wkt}"
                yield row
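
For context, a standalone illustration of the geometry conversion used above: shapely's shape() builds a geometry object from a GeoJSON mapping, and .wkt serializes it. The SRID is a hypothetical example value.

from shapely.geometry import shape

record = {"geometry": {"type": "Point", "coordinates": [12.5, 55.7]}}
wkt = shape(record["geometry"]).wkt   # 'POINT (12.5 55.7)'
print(f"SRID=4326;{wkt}")             # EWKT: 'SRID=4326;POINT (12.5 55.7)'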
Example No. 4
def test_reader(data):
    text = '\n'.join(json.dumps(item) for item in data) + '\n'

    fp = six.StringIO(text)
    reader = ndjson.reader(fp)

    read_items = [item for item in reader]

    assert read_items == data
Example No. 5
def load_data_files(directory: str) -> Iterator[Dict]:
    filepath: str = "/".join((directory, DATA_FILENAME))

    with open(filepath) as f:
        reader = ndjson.reader(f)
        # stream records one at a time instead of loading the whole file
        yield from reader
Example No. 6
    def feature_loader(self, path: str) -> pd.DataFrame:
        """
        Build a dataframe from a source ndjson file, one row per record.
        """
        with open(path) as f:
            reader = ndjson.reader(f)
            # collect one single-row frame per record; concatenating once at
            # the end avoids the quadratic cost of concat inside the loop
            frames = [pd.DataFrame([post], columns=post.keys())
                      for post in reader]

        if frames:
            self.dataframe = pd.concat([self.dataframe, *frames],
                                       axis=0, ignore_index=True)
        return self.dataframe
Example No. 7
    def connect(self):
        if self.reader is None and self.writer is None:

            if self.access == AccessMode.READ:
                self.file_handle = open(self.engine_params.file_path, "r", encoding=self.encoding)
                self.file_size = os.stat(self.engine_params.file_path).st_size
                self.reader = ndjson.reader(self.file_handle)

            elif self.access == AccessMode.WRITE:
                self.file_handle = open(self.engine_params.file_path, "w", encoding=self.encoding)
                self.writer = ndjson.writer(self.file_handle)

            else:
                raise ValueError("Unknown access mode")
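
A small round-trip sketch of the two access modes this connector wraps (the file name is a placeholder): ndjson.writer emits one JSON object per line, and ndjson.reader streams them back.

import ndjson

with open('example.ndjson', 'w') as f:
    writer = ndjson.writer(f)
    writer.writerow({'id': 1, 'name': 'a'})
    writer.writerow({'id': 2, 'name': 'b'})

with open('example.ndjson') as f:
    records = list(ndjson.reader(f))  # [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]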
Example No. 8
    def test_format_tweet_as_csv_row(self):
        output = StringIO()

        writer = csv.writer(output)
        writer.writerow(TWEET_FIELDS)

        with open_resource('tweet-export.jsonl') as f:
            for item in ndjson.reader(f):
                tweet = item['_source']
                row = format_tweet_as_csv_row(tweet, item_id=item['_id'])

                assert len(row) == len(TWEET_FIELDS)

                writer.writerow(row)

        with open_resource('tweet-export.csv') as f:
            output.seek(0)
            assert list(csv.reader(output)) == list(csv.reader(f))
Example No. 9
    def create_from_local_file(cls,
                               client,
                               project_id: str,
                               name: str,
                               file: Path,
                               validate_file=True) -> 'BulkImportRequest':
        """
        Creates a BulkImportRequest from a local ndjson file with predictions.

        Args:
            client (Client): a Labelbox client
            project_id (str): id of project for which predictions will be imported
            name (str): name of BulkImportRequest
            file (Path): local ndjson file with predictions
            validate_file (bool): whether to check that `file` is a valid
                ndjson file before uploading
        Returns:
            BulkImportRequest object

        """
        file_name = _make_file_name(project_id, name)
        content_length = file.stat().st_size
        request_data = _make_request_data(project_id, name, content_length,
                                          file_name)

        with file.open('rb') as f:
            if validate_file:
                reader = ndjson.reader(f)
                # ensure that the underlying json load call is valid
                # https://github.com/rhgrant10/ndjson/blob/ff2f03c56b21f28f7271b27da35ca4a8bf9a05d0/ndjson/api.py#L53
                # by iterating through the file so we only store
                # each line in memory rather than the entire file
                try:
                    _validate_ndjson(reader)
                except ValueError:
                    raise ValueError(f"{file} is not a valid ndjson file")
                else:
                    f.seek(0)
            file_data = (file.name, f, NDJSON_MIME_TYPE)
            response_data = _send_create_file_command(client, request_data,
                                                      file_name, file_data)
        return cls(client, response_data["createBulkImportRequest"])
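
The validation above relies on ndjson.reader being lazy: iterating it parses one line at a time, so only a single line is held in memory. _validate_ndjson is Labelbox-internal; a minimal standalone sketch of the same idea:

import json

def is_valid_ndjson(path):
    # parse line by line so only one line is in memory at a time
    with open(path) as f:
        for lineno, line in enumerate(f, start=1):
            try:
                json.loads(line)
            except json.JSONDecodeError:
                print(f"invalid JSON on line {lineno}")
                return False
    return True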
Example No. 10
    def test_transform_tweet_into_csv_dict(self):
        output = StringIO()
        writer = csv.DictWriter(
            output,
            fieldnames=TWEET_FIELDS,
            extrasaction='ignore',
            restval='',
            quoting=csv.QUOTE_MINIMAL
        )
        writer.writeheader()

        with open_resource('tweet-export.jsonl') as f:
            for item in ndjson.reader(f):
                tweet = item['_source']
                transform_tweet_into_csv_dict(
                    tweet,
                    item_id=item['_id']
                )

                writer.writerow(tweet)

        with open_resource('tweet-export.csv') as f:
            output.seek(0)
            assert list(csv.DictReader(output)) == list(csv.DictReader(f))
Example No. 11
def ndjson_to_list(datafile):
    with open(datafile) as f:
        return list(ndjson.reader(f))
Example No. 12
import ndjson

from imblearn.under_sampling import RandomUnderSampler

###############################################################################
#####                                Corpus                               #####
###############################################################################
#### keys
## all
#'overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName',
# 'reviewText', 'summary', 'unixReviewTime'
## optional
# vote
ratings = []
reviews = []
summaries = []
with open("data/raw/Video_Games_5.json", "r") as infile:
    reader = ndjson.reader(infile)

    for review in reader:
        try:
            rating = review["overall"]
            rv = review["reviewText"]
            s = review["summary"]
        except KeyError:
            # skip reviews missing any of the required fields
            continue
        if len(rv) > 0 and len(s) > 0:
            ratings.append(rating)
            reviews.append(rv)
            summaries.append(s)
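
RandomUnderSampler is imported at the top but not used in this excerpt; a hedged sketch of how it could balance the star-rating classes, resampling row indices so the parallel lists stay aligned:

import numpy as np

rus = RandomUnderSampler(random_state=42)
indices = np.arange(len(ratings)).reshape(-1, 1)  # imblearn expects 2-D features
indices_res, ratings_res = rus.fit_resample(indices, ratings)
reviews_res = [reviews[i] for i in indices_res.ravel()]
summaries_res = [summaries[i] for i in indices_res.ravel()]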

###############################################################################
#####                     The distribution of stars                       #####
Example No. 13
"""
Created on Thu Jun  4 10:44:04 2020

@author: Marco Polignano
"""
import pandas as pd
import ndjson
import numpy as np

from config import TRAINING_SET_PATH

dataframe = pd.DataFrame()

# Load the training set into a dataframe
with open(TRAINING_SET_PATH) as f:
    reader = ndjson.reader(f)

    for post in reader:
        df = pd.DataFrame([post], columns=post.keys())
        dataframe = pd.concat([dataframe, df], axis=0, ignore_index=True)

print(dataframe)

X = dataframe['sentence']
y = dataframe['score']


# Root-mean-squared-error helper
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets)**2).mean())
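
A quick sanity check of the helper: a constant error of 0.5 on both points gives an RMSE of 0.5.

print(rmse(np.array([1.0, 2.0]), np.array([1.5, 2.5])))  # 0.5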
Example No. 14
def get_jsonl_resource(name):
    with open(join(RESOURCES_DIR, name), encoding='utf-8') as f:
        return list(ndjson.reader(f))