Example #1
 def test_excel_to_ndjson(self):
     out_f = self.output_folder / "test_excel_to_ndjson_out.ndjson"
     self.converter.convert_file(self.input_folder / 'sample.csv', 'ndjson',
                                 out_f)
     self.assertEqual(
         ndjson.load(out_f.open()),
         ndjson.load((self.input_folder / 'sample_null.ndjson').open()))
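The assertions in these tests rely on ndjson.load returning a plain list of dicts, one per line. A minimal, self-contained sketch of that round trip (the file name here is made up):

import ndjson

records = [{"id": 1, "name": "foo"}, {"id": 2, "name": "bar"}]

# dump writes one JSON object per line
with open("example_out.ndjson", "w") as f:
    ndjson.dump(records, f)

# load returns a list of dicts, one per line
with open("example_out.ndjson") as f:
    loaded = ndjson.load(f)

assert loaded == records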
Example #2
def get_vbb_data(centre):
	global stations
	global station_types
	g = Graph()
	with open('nodes.ndjson') as f:
		dataSta = ndjson.load(f)

	# convert to and from objects
	textSta = ndjson.dumps(dataSta)
	dataSta = ndjson.loads(textSta)
	for i in dataSta:
		#tupel = str(i['metadata']['x'])+","+str(i['metadata']['y'])
		x = float(i['metadata']['longitude'])
		y = float(i['metadata']['latitude'])
		idSt = str(i['id'])
		g.add_node(idSt)
		stations[idSt] = (x, y)
		# g.add_node(tupel)

	with open('edges.ndjson') as f:
		dataDist = ndjson.load(f)

	# convert to and from objects
	textDist = ndjson.dumps(dataDist)
	dataDist = ndjson.loads(textDist)

	for i in dataDist:
		stationA = str(i['source'])
		stationB = str(i['target'])
		distance = int(i['metadata']['time'])
		line = i['metadata']['line']
		if line.startswith('RB') or line.startswith('RE'):
			station_types[stationA] = 1
			station_types[stationB] = 1
		elif line.startswith('U') or line.startswith('S'):
			if stationA in station_types:
				if station_types[stationA] > 1:
					station_types[stationA] = 2
			else:
				station_types[stationA] = 2
			if stationB in station_types:
				if station_types[stationB] > 1:
					station_types[stationB] = 2
			else:
				station_types[stationB] = 2
		else:
			if stationA in station_types:
				if station_types[stationA] > 2:
					station_types[stationA] = 3
			else:
				station_types[stationA] = 3

			if stationB in station_types:
				if station_types[stationB] > 2:
					station_types[stationB] = 3
			else:
				station_types[stationB] = 3
		g.add_edge(stationA, stationB, distance)

	return dijsktra(g, centre)  # Station name of Dabendorf node: 900000245024
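get_vbb_data only touches a few fields of each line in nodes.ndjson and edges.ndjson. A hypothetical pair of files, reduced to exactly those fields (the values are made up, apart from the Dabendorf id taken from the comment above; the real files may carry more metadata):

import ndjson

nodes = [{"id": "900000245024",
          "metadata": {"longitude": "13.4049", "latitude": "52.1067"}}]
edges = [{"source": "900000245024", "target": "900000244201",
          "metadata": {"time": 4, "line": "S8"}}]

with open('nodes.ndjson', 'w') as f:
    ndjson.dump(nodes, f)
with open('edges.ndjson', 'w') as f:
    ndjson.dump(edges, f)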
Example #3
 def test_csv_semicolon_to_ndjson(self):
     self.df_handler.convert_file(self.input_folder /
                                  'invoice_semicolon_delimiter.csv',
                                  self.out_file_path,
                                  'csv',
                                  'ndjson',
                                  read_options={'sep': ';'})
     self.assertCountEqual(
         ndjson.load(self.out_file_path.open()),
         ndjson.load((self.input_folder / 'invoice.ndjson').open()))
Example #4
 def test_excel_one_sheet_to_ndjson(self):
     self.df_handler.convert_file(self.input_folder /
                                  'invoice_multi_sheets.xlsx',
                                  self.out_file_path,
                                  'excel',
                                  'ndjson',
                                  read_options={'sheet_name': 'Sheet2'})
     self.assertCountEqual(
         ndjson.load(self.out_file_path.open()),
         ndjson.load(
             (self.input_folder / 'invoice_id_reversed.ndjson').open()))
Example #5
 def test_csv_to_ndjson_with_aito_schema(self):
     with (self.input_folder /
           'invoice_aito_schema_altered.json').open() as f:
         schema_altered = json.load(f)
     self.df_handler.convert_file(self.input_folder / 'invoice.csv',
                                  self.out_file_path,
                                  'csv',
                                  'ndjson',
                                  use_table_schema=schema_altered)
     self.assertCountEqual(
         ndjson.load(self.out_file_path.open()),
         ndjson.load((self.input_folder / 'invoice_altered.ndjson').open()))
Example #6
def loadEventJson(path, jsonlInput):
    '''This automatically supports files that have been compressed with `gzip`.'''
    if (jsonlInput):
        import ndjson as json
    else:
        import json

    try:
        with gzip.open(path, FILE_MODE_READ_TEXT) as gid:
            all_recs = json.load(gid)
    except OSError:
        with open(path, FILE_MODE_READ) as gid:
            all_recs = json.load(gid)

    return all_recs
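The constants FILE_MODE_READ_TEXT and FILE_MODE_READ are defined elsewhere in that project; assuming they are the usual 'rt'/'r' modes, a self-contained sketch of the same gzip-or-plain fallback looks like this:

import gzip
import json

import ndjson

FILE_MODE_READ_TEXT = 'rt'  # assumed value
FILE_MODE_READ = 'r'        # assumed value

def load_maybe_gzipped(path, jsonl=True):
    """Standalone sketch: try the file as gzip first, fall back to plain text."""
    loader = ndjson if jsonl else json
    try:
        with gzip.open(path, FILE_MODE_READ_TEXT) as fh:
            # reading a non-gzip file through gzip raises OSError (BadGzipFile)
            return loader.load(fh)
    except OSError:
        with open(path, FILE_MODE_READ) as fh:
            return loader.load(fh)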
Example #7
    def __init__(self, primary_files, min_count, supporting_files=None):

        self.negatives = []
        self.discards  = []
        self.negpos    = 0

        self.word2id        = dict()
        self.id2word        = dict()
        self.word_frequency = dict()

        self.token_count        = 0
        self.max_num_words_file = 0

        self.primary_files      = primary_files
        self.supporting_files   = supporting_files

        self.file_paths = primary_files if supporting_files is None else primary_files + supporting_files
        
        self.data = None

        if self.ndJson:
            with open(self.primary_files) as f:
                self.data = ndjson.load(f)
            self.readWordsNdJson(min_count)
        else:
            self.readWords(min_count)
            
        self.initTableNegatives()
        self.initTableDiscards()
Example #8
def get_segmented_reviews(retrievepath, savepath):
    '''Read the data source json file at retrievepath, run sentence
    segmentation on each review, and dump the result to the file at
    savepath.'''

    # open the json file and read the reviews in.
    # The file is actually ndjson (separated by newlines, not commas)
    try:
        with open(retrievepath, encoding='latin-1') as f:
            datastore = ndjson.load(f)
    except IOError:
        print('An error occurred trying to read the file.')

    # using sent_tokenize() to split a review text into a list of sentences.
    for review in datastore:
        review['text'] = sent_tokenize(review['text'])
        # number of sentence in each review text
        review['num_sentence'] = len(review['text'])

    # save the segmented comments to data folder for further analysis
    try:
        with open(savepath, 'w+') as f:
            ndjson.dump(datastore, f)
    except IOError:
        print('An error occurred trying to save the file.')
Example #9
def generate_class(filename):
    CLASS_NAME = filename.replace('.ndjson',
                                  '').replace('full_simplified_', '')
    print(f'Begin generating {CLASS_NAME} images...')

    try:
        os.mkdir(f'{OUTPUT_BASE_DIR}{CLASS_NAME}')
    except FileExistsError:
        pass  # Ignore if dir already exists

    with open(f'{INPUT_DIR}{filename}') as f:
        data = ndjson.load(f)
        print(f"Size of {CLASS_NAME}s set: {len(data)}")

        count = 0
        for sample in data:
            if sample['recognized'] != True: continue
            canvas = generate_image(sample)
            canvas = resize_image(canvas)
            save_image(canvas, CLASS_NAME, count)
            count += 1
            if count % (NUMBER_OF_IMAGES / 4) == 0:
                print(
                    f"Generated {(count*100)/NUMBER_OF_IMAGES}% of {CLASS_NAME} images..."
                )
            if count == NUMBER_OF_IMAGES: break

    print(f'Finished generating {CLASS_NAME} images.')
    return
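This example (and the later Quick, Draw!-style examples #20 and #28) reads only a handful of fields per record. A hypothetical record trimmed to the keys these snippets actually access ('word', 'key_id', 'recognized', and 'drawing' as a list of strokes, each stroke being parallel x and y coordinate lists):

sample = {
    "word": "house",
    "key_id": "5152802093400064",
    "recognized": True,
    # a single three-point stroke: [x coordinates], [y coordinates]
    "drawing": [[[10, 40, 70], [80, 20, 80]]],
}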
Example #10
def process_kibana_object(obj_type, exportpath, indexpattern=None):
    print('# Processing kibana object: %s' % obj_type)

    if obj_type != 'index-pattern':
        src_file_name = '%s%s' % (EXPORT_FILES_PREFIX_KIBANA, obj_type)
    else:
        if indexpattern is None:
            for i in INDEX_PATTERNS_FILTER.split('|'):
                process_kibana_object(obj_type, exportpath, indexpattern=i)
            return
        else:
            src_file_name = '%s%s_%s' % (EXPORT_FILES_PREFIX_KIBANA, obj_type, indexpattern)

    src_file = os.path.join(exportpath, '%s.ndjson' % src_file_name)
    diff_file = os.path.join(exportpath, DIFF_PATH, '%s.json' % src_file_name)
    print('\tOpening %s: %s' % (obj_type, src_file))
    with open(src_file, 'r') as f:
        src_ndjson = ndjson.load(f)

    for s in src_ndjson:
        if obj_type == 'index-pattern':
            s['attributes']['fields'] = sorted(json.loads(s['attributes']['fields']), key=lambda x : x['name'])
        elif obj_type == 'search':
            s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'] = json.loads(s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'])
        elif obj_type == 'visualization':
            s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'] = json.loads(s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'])
            s['attributes']['visState'] = json.loads(s['attributes']['visState'])
        elif obj_type == 'dashboard':
            s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'] = json.loads(s['attributes']['kibanaSavedObjectMeta']['searchSourceJSON'])
            s['attributes']['optionsJSON'] = json.loads(s['attributes']['optionsJSON'])
            s['attributes']['panelsJSON'] = json.loads(s['attributes']['panelsJSON'])

    print('\tWriting output to: %s' % diff_file)
    with open(diff_file, 'w') as f:
        json.dump(src_ndjson, f, indent=4, sort_keys=True)
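The json.loads calls are needed because a Kibana saved-objects export stores these nested structures as JSON-encoded strings inside each line. A heavily trimmed, hypothetical visualization line to illustrate what gets decoded:

import json

kibana_line = {
    "type": "visualization",
    "attributes": {
        "visState": '{"type": "histogram", "params": {}}',
        "kibanaSavedObjectMeta": {
            "searchSourceJSON": '{"query": {"query": "", "language": "kuery"}}'
        },
    },
}

# after decoding, the nested strings become real dicts and diff cleanly as JSON
kibana_line["attributes"]["visState"] = json.loads(kibana_line["attributes"]["visState"])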
Example #11
def parse_label(filename, path=RAW_DIR_NAME, decode=None):
    """
    Helper for parse_dataset: parses a single .ndjson file associated with the
    specified path
    @param filename (str): string specifying the path to the .ndjson file to
                            parse
    @param decode (None or "jpg"): whether to decode sketches as images. By
                            default, sketches are saved as ndjson files.
    @param path - str: folder where training examples will be stored.
    """
    list_ids = []
    label, _ = os.path.splitext(filename)

    full_filename = os.path.join(path, filename)
    with open(full_filename) as f:
        if decode == 'jpg':
            dir_name = os.path.join(path, '../img/' + label)
        else:
            dir_name = os.path.join(path, label)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        drawings = ndjson.load(f)
        for drawing in drawings:
            example_filename = save_training_example(drawing, dir_name, decode)
            list_ids.append(example_filename)
    return list_ids
Example #12
def extract_scenes(paths, outdir):
    '''
    Just extract scenic comments and save them somewhere
    '''

    # iterate through files
    for path in tqdm(paths):
        # get file name
        fname = os.path.basename(path)
        outpath = os.path.join(outdir, fname)

        # open file
        with open(path) as f:
            session = ndjson.load(f)

        # process all docs in that file
        scenes = []
        for doc in session:
            # split texts and scenic comments
            scene, text = extract_scenic_comments(doc, tokentype='text')
            scenes.append(scene)

        # export
        try:
            with open(outpath, 'w') as f:
                ndjson.dump(scenes, f)
        except FileNotFoundError:
            pass
Example #13
def infodynamics_plots(ntr_paths, model_dir):
    '''
    Make adaptline & regline for each result/window
    '''
    # for every path given
    for path in ntr_paths:
        # get extract window
        w = re.findall(r'\d+', os.path.basename(path))
        assert len(w) == 1
        w = w[0]

        print('[info] plotting window {}'.format(w))

        # open infodynamics results
        with open(path) as f:
            ntr_df = pd.DataFrame(ndjson.load(f))
        
        try:
            plot_window(
                ntr_df=ntr_df,
                w=w,
                model_dir=model_dir
            )
        except np.linalg.LinAlgError:
            print('{} is a singular matrix'.format(w))
Example #14
def get_label_vocab(train_data_path):
    with open(train_data_path) as i:
        d = ndjson.load(i)

    labels = list(set([i["label"] for i in d]))
    label_vocab = {label:i for i,label in enumerate(labels)}
    return label_vocab
Example #15
File: 1up.py  Project: crawforc3/misc_code
    def lookup_references(self):
        """Looks up references to a patient"""
        resource_paths = self.load_resources()
        self.references = {}

        for resource_path in resource_paths:
            with open(resource_path, 'r') as inf:
                resources = ndjson.load(inf)

            for resource in resources:
                if 'patient' in resource.keys():
                    if resource['patient']['reference'].split(
                            '/')[1] == self.patient_id:
                        resource_type = resource['resourceType']
                        if resource_type not in self.references.keys():
                            self.references[resource_type] = 1
                        else:
                            self.references[resource_type] += 1
                elif 'subject' in resource.keys():
                    if resource['subject']['reference'].split(
                            '/')[1] == self.patient_id:
                        resource_type = resource['resourceType']
                        if resource_type not in self.references.keys():
                            self.references[resource_type] = 1
                        else:
                            self.references[resource_type] += 1
Example #16
File: 1up.py  Project: crawforc3/misc_code
    def lookup_encounters(self):
        with open('./data/Encounter.ndjson', 'r') as inf:
            encounter_file = ndjson.load(inf)

        for i, encounter in enumerate(encounter_file):
            #print(encounter)

            patient_reference = patient = encounter['subject'][
                'reference'].split('/')[1]
            #print(patient_reference, patient_id)
            if patient_reference == self.patient_id:
                print("cool")

                print(encounter['resourceType'])

                patient = encounter['subject']['reference']

                practitioner = encounter['participant'][0]['individual'][
                    'reference']
                if 'Practitioner' not in self.references.keys():
                    self.references['Practitioner'] = 1
                else:
                    self.references['Practitioner'] += 1

                location = encounter['location'][0]['location']['reference']
                if 'Location' not in self.references.keys():
                    self.references['Location'] = 1
                else:
                    self.references['Location'] += 1

                organization = encounter['serviceProvider']['reference']
                if 'Organization' not in self.references.keys():
                    self.references['Organization'] = 1
                else:
                    self.references['Organization'] += 1
Example #17
def main():
    '''
    Run preprocessing
    '''
    # initialize argparser with a desc
    ap = argparse.ArgumentParser(
        description="Parallelized maximal preprocessing using stanza")

    # input path
    ap.add_argument("-p",
                    "--inpath",
                    required=True,
                    help="path to ndjson with texts to process")

    # output path
    ap.add_argument("-o",
                    "--outpath",
                    required=True,
                    help="where results will be saved")

    # language of texts
    ap.add_argument("--lang",
                    required=False,
                    type=str,
                    default='da',
                    help="two character ISO code of a desired language")
    # window
    ap.add_argument("--jobs",
                    required=False,
                    type=int,
                    default=4,
                    help="number of workers to split the job between.")

    # hotfix
    ap.add_argument("--bugstring",
                    required=False,
                    type=bool,
                    default=False,
                    help="remove seqences of equal signs from documents?")

    # parse that
    args = vars(ap.parse_args())

    # run functions down the line
    print('[info] Importing {}'.format(args['inpath']))
    with open(args['inpath'], 'r') as f_in:
        texts = ndjson.load(f_in)

    print('[info] Clearing buggy strings.')
    if args['bugstring']:
        texts = _delete_many_equal_signs(texts)

    print('[info] Stanza starting {} jobs'.format(args['jobs']))
    dfs_out = stanza_multicore(texts=texts,
                               lang=args['lang'],
                               n_jobs_gpu=args['jobs'])

    print('[info] Saving results to {}'.format(args['outpath']))
    with open(args['outpath'], "w") as f_out:
        ndjson.dump(dfs_out, f_out)
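Assuming the script above is saved as preprocess.py (the file name is not given in the source), a typical invocation would be:

    python preprocess.py -p texts.ndjson -o processed.ndjson --lang da --jobs 4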
Example #18
def convert(compressedJSONFile, destDir=".", force = False, skipchecks=False):
    path = os.path.normpath(compressedJSONFile)
    fileName = path.split(os.sep)[-1]
    date = datetimeFromARDFilename(fileName)
    day = cd.dayFromDate(date)
    newFile =  destDir+"/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day))

    redo = False
    yesterDayRows = -1  # set up front so the row-count checks below also work when skipchecks is True
    if not skipchecks:
        # check if a previous file exists and make sure the current file is not broken
        previousFile =  destDir+"/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day-1))

        if os.path.isfile(previousFile):
            yesterdayFrame = dt.fread(previousFile)
            yesterDayRows = yesterdayFrame.nrows
        else:
            print("No file for previous day {}".format(day-1))

        allowedShrinkageDays = [33,68]
        allowedSameDays = [33]
        allowedJumpDays = [46,66]

        if not force and os.path.isfile(newFile) and yesterDayRows >= 0:
            existingFrame = dt.fread(newFile)
            existingRows = existingFrame.nrows
            if existingRows < yesterDayRows:
                if not day in allowedShrinkageDays:
                    print("Existing .csv file for day {} contains less rows ({}) than previous day file ({}), redoing".format(day,existingRows,yesterDayRows))
                    redo = True
                else:
                    print("On day {} the number of rows was reduced from {} to compared to yesterday's file ({})".format(day,existingRows,yesterDayRows))
            else:
                if existingRows == yesterDayRows:
                    if not day in allowedSameDays:
                        print("Existing .csv file for day {} contains same number of rows ({}) than previous day file ({}), redoing".format(day,existingRows,yesterDayRows))
                        redo = True
                    else:
                        print( "Existing .csv file for day {} contains same number of rows ({}) than previous day file ({}) but we can't do anything about it".format(
                                day, existingRows, yesterDayRows))
                elif (existingRows > yesterDayRows * 1.1) and (existingRows - yesterDayRows > 5000) and not day in allowedJumpDays:
                    print("Existing .csv file for day {} contains much more rows ({}) than previous day file ({}), redoing".format(day,existingRows,yesterDayRows))
                    redo = True

                print("Existing .csv file contains {} rows, {} more than yesterday".format(existingRows,existingRows-yesterDayRows))

    if force or redo or not os.path.isfile(newFile):
        print("Loading " + compressedJSONFile)
        #with bz2.open(compressedJSONFile, "rb") as f:
        with lzma.open(compressedJSONFile, "rb") as f:
            content = ndjson.load(f)
            frame = dt.Frame(content)
            if frame.nrows <= yesterDayRows and not day in allowedShrinkageDays:
                print("Rejecting '{}' because it contains less rows than yesterdays file".format(compressedJSONFile))
                return
            print("Saving " + newFile)
            frame.to_csv(newFile)
    else:
        print("Skipping '{}' because '{}' already exists".format(compressedJSONFile, newFile))
Example #19
def load_ndjson(path: str) -> Optional[Union[List, Dict]]:
    try:
        with open(path, "r") as read_file:
            data = ndjson.load(read_file)
    except ValueError as e:
        print("Invalid json: %s" % e)
        return None
    return data
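Because the helper swallows the ValueError and returns None, callers need to check for that; a brief usage sketch with a made-up path:

records = load_ndjson("events.ndjson")
if records is None:
    raise SystemExit("could not parse events.ndjson")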
Example #20
def load_data(drawing_count):
    # load from file-like objects
    with open('house.ndjson') as f:
        drawings = ndjson.load(f)
        return [
            transform_drawing(drawing) for drawing in drawings[0:drawing_count]
            if len(drawing['drawing']) > 5
        ]
Example #21
def concat_texts_timebins(lemma_path, metadata_path, outdir, timebin='10Min'):
    '''
    Concatenate preprocessed documents into time bins of size `timebin` and
    dump the result as ndjson in `outdir`.
    '''
    # load files
    with open(lemma_path) as fin:
        file_lemma = ndjson.load(fin)

    with open(metadata_path) as fin:
        file_meta = ndjson.load(fin)
        timestamps = [doc['start'] for doc in file_meta]
        del file_meta

    # resample
    df_resampled = (pd.DataFrame(
        file_lemma, index=pd.to_timedelta(timestamps)).resample(timebin).sum())

    # get rid of 0 (no document in time bin)
    df_resampled = (df_resampled.replace(0, np.nan).dropna())

    # get rid of [] (there is a document but no features in time bin)
    df_resampled['text'] = df_resampled.text[df_resampled.text.apply(len) > 0]
    df_resampled = (df_resampled.dropna().reset_index())

    # get timestamp as str
    df_resampled['time'] = df_resampled['index'].astype(str).str.extract(
        r'days (.*?)\.')

    # serialize
    file_res = []
    for i, row in df_resampled.iterrows():
        res = dict()
        res.update({
            'time': row.time,
            'text': row.text,
            'lemma': row.lemma,
            'pos': row.pos,
            'dep': row.dep,
            'ner': row.ner
        })
        file_res.append(res)

    outfname = os.path.basename(lemma_path)
    with open(os.path.join(outdir, outfname), 'w') as fout:
        ndjson.dump(file_res, fout)

    return None
Example #22
def load_dataset(f, label_vocab):
    with open(f) as i:
        d = ndjson.load(i)

    table = [(PROMPT + " " + i["text"], label_vocab[i["label"]]) for i in d]
    df = pd.DataFrame(table)
    df.columns = ["sentence", "label"]
    return df
Example #23
    def process_kibana_object(self, obj_type, indexpattern=None):
        """
        Create json from ndjson kibana object to ease diff during commits
        """
        print("# Processing kibana object: %s" % obj_type)

        if obj_type != "index-pattern":
            src_file_name = "%s%s" % (EXPORT_FILES_PREFIX_KIBANA, obj_type)
        else:
            if indexpattern is None:
                for i in INDEX_PATTERNS_FILTER.split("|"):
                    self.process_kibana_object(obj_type, indexpattern=i)
                return
            else:
                src_file_name = "%s%s_%s" % (
                    EXPORT_FILES_PREFIX_KIBANA,
                    obj_type,
                    indexpattern,
                )

        src_file = os.path.join(self.export_path, "%s.ndjson" % src_file_name)
        diff_file = os.path.join(self.export_path, DIFF_PATH,
                                 "%s.json" % src_file_name)
        print("\tOpening %s: %s" % (obj_type, src_file))
        with open(src_file, "r") as src_ndjson_file:
            src_ndjson = ndjson.load(src_ndjson_file)

        for src_ndjson_line in src_ndjson:
            if obj_type == "index-pattern":
                src_ndjson_line["attributes"]["fields"] = sorted(
                    json.loads(src_ndjson_line["attributes"]["fields"]),
                    key=lambda x: x["name"],
                )
            elif obj_type == "search":
                src_ndjson_line["attributes"]["kibanaSavedObjectMeta"][
                    "searchSourceJSON"] = json.loads(
                        src_ndjson_line["attributes"]["kibanaSavedObjectMeta"]
                        ["searchSourceJSON"])
            elif obj_type == "visualization":
                src_ndjson_line["attributes"]["kibanaSavedObjectMeta"][
                    "searchSourceJSON"] = json.loads(
                        src_ndjson_line["attributes"]["kibanaSavedObjectMeta"]
                        ["searchSourceJSON"])
                src_ndjson_line["attributes"]["visState"] = json.loads(
                    src_ndjson_line["attributes"]["visState"])
            elif obj_type == "dashboard":
                src_ndjson_line["attributes"]["kibanaSavedObjectMeta"][
                    "searchSourceJSON"] = json.loads(
                        src_ndjson_line["attributes"]["kibanaSavedObjectMeta"]
                        ["searchSourceJSON"])
                src_ndjson_line["attributes"]["optionsJSON"] = json.loads(
                    src_ndjson_line["attributes"]["optionsJSON"])
                src_ndjson_line["attributes"]["panelsJSON"] = json.loads(
                    src_ndjson_line["attributes"]["panelsJSON"])

        print("\tWriting output to: %s" % diff_file)
        with open(diff_file, "w") as dst_json_file:
            json.dump(src_ndjson, dst_json_file, indent=4, sort_keys=True)
Example #24
 def test_csv_to_compressed_ndjson(self):
     self.df_handler.convert_file(self.input_folder / 'invoice.csv',
                                  self.out_file_path,
                                  'csv',
                                  'ndjson',
                                  convert_options={'compression': 'gzip'})
     self.assertCountEqual(
         read_ndjson_gz_file(self.out_file_path),
         ndjson.load((self.input_folder / 'invoice.ndjson').open()))
Example #25
def load_data(ndjson_path):
    '''
    Read a preprocessed file & convert to ttx format.
    '''
    with open(ndjson_path, 'r') as f:
        obj = ndjson.load(f)

    obj_dfs = [pd.DataFrame(dat) for dat in obj]

    return obj_dfs
Example #26
def get_reviews(path):
    try:
        with open(path) as f:
            data_set = ndjson.load(f)
    except IOError:
        print('An error occurred trying to read the file.')
    all_reviews = []
    for data in data_set:
        all_reviews.append(data['text'])
    return all_reviews
Example #27
def parse_dataset(path=RAW_DIR_NAME, decode=None, early_return=True):
    """
    Restructures dataset from '.ndjson' files into folders. Each folder will be
    of the form 'dataset/{LABEL}' and will contain 1 file per training example.
    Also saves the list of all filenames to 'filenames.txt'.

    @param path - str: path to directory containing dataset
    @param decode - None or "jpg" - how to decode training examples
    @param early_return - bool: indicates whether method should return early
        if 'filenames.txt' already exists

    @returns list containing all the filenames of the training examples 
        (relative to path)
    @returns list containing all the labels of the dataset
    """
    list_ids = []
    labels = set()

    # If the filenames.txt file already exists, parse the file to find
    # list_ids and labels, and return early
    if decode == 'jpg':
        list_ids_filename = os.path.join(path, '../img/' + 'filenames.txt')
    else:
        list_ids_filename = os.path.join(path, 'filenames.txt')
    if early_return and os.path.exists(list_ids_filename):
        with open(list_ids_filename) as f:
            list_ids = ndjson.load(f)
        for list_id in list_ids:
            label = os.path.basename(os.path.dirname(list_id))
            labels.add(label)
        return list_ids, list(labels)

    # Loop through all '.ndjson' files and split into individual files
    pool = mp.Pool(mp.cpu_count())
    files = os.listdir(path)
    files = [f for f in files if os.path.splitext(f)[1] == '.ndjson']
    list_ids_temp = []

    parse = functools.partial(parse_label, path=path, decode=decode)
    pool.map_async(parse, files, callback=list_ids_temp.extend)
    pool.close()
    pool.join()

    # Convert list_ids_temp from list of lists to just a list
    list_ids = []
    for list_id in list_ids_temp:
        list_ids += list_id

    # Write output to 'dataset/filename.txt' and find all labels
    with open(list_ids_filename, 'w') as f:
        ndjson.dump(list_ids, f)
    for list_id in list_ids:
        label = os.path.basename(os.path.dirname(list_id))
        labels.add(label)
    return list_ids, list(labels)
Example #28
    def create_image_records(self):
        """Loop through the image files and write to tfrecords"""
        data_left = True
        passes_through_file = 5
        amount = 0
        while data_left:
            amount += 1
            writer = tf.python_io.TFRecordWriter(self.output_file +
                                                 '/record_' +
                                                 str(passes_through_file))

            for x, filename in enumerate(tqdm(
                    self.C.file_list)):  #For all the files names
                x += 1
                if x > 3:
                    break
                with open(filename) as f:
                    data = ndjson.load(f)

                    start_index = passes_through_file * self.images_per_class_per_tf_record
                    end_index = start_index + self.images_per_class_per_tf_record

                    if end_index > len(data):
                        end_index = len(data)

                    for j in range(
                            start_index, end_index
                    ):  #Go through the strokes and construct image from it
                        line = data[j]
                        if j < (passes_through_file *
                                self.images_per_class_per_tf_record
                                ) + self.images_per_class_per_tf_record:
                            img = self.draw_it(line).reshape(1, 255, 255)
                            img_raw = img.tostring()

                            feature = {
                                'class':
                                self._int64_feature(
                                    self.C.class_list.index(line['word'])),
                                'key':
                                self._bytes_feature(str.encode(
                                    line['key_id'])),
                                'image_raw':
                                self._bytes_feature(img_raw)
                            }

                            example = tf.train.Example(
                                features=tf.train.Features(feature=feature))
                            writer.write(example.SerializeToString())
                        else:
                            break
            passes_through_file += 1
            print("closing writer")
            writer.close()
            break  #only do one file for now
Example #29
def import_normalize(doctop_path,
                     train_data_path,
                     meta_data_path,
                     datetime_col='time'):
    '''
    Import & normalize a document-topic matrix.

    So far, only the averaging method is implemented!
    '''
    # DOCTOP
    with open(doctop_path) as f:
        doctop = ndjson.load(f)
    # normalize
    norm_all = [[value / sum(doc) for value in doc] for doc in doctop]
    # to df
    norm_df = pd.DataFrame(norm_all)
    # sanity check: the DataFrame round-trip preserves the normalized values
    assert norm_df.values.tolist() == norm_all

    # TRAIN DATA
    with open(train_data_path) as f:
        input_texts = ndjson.load(f)
    # iDs of documents used for training
    ids = [doc['id'] for doc in input_texts]

    # META DATA (dates)
    meta = pd.read_csv(meta_data_path, parse_dates=[datetime_col])
    # keep only docs used for training
    meta_trained = meta.iloc[ids, :]

    # AGGREGATE
    days = meta_trained['time'].dt.floor('d')  # parse_timebin_size
    norm_df.index = days.index
    norm_df['days'] = days
    topic_col_names = [col for col in norm_df.columns if col != 'days']
    # average
    avg_topic_df = norm_df.groupby('days')[topic_col_names].mean()
    doctop_avg = avg_topic_df.values.tolist()
    # normalize again
    doctop_avg = [[value / sum(doc) for value in doc] for doc in doctop_avg]

    return doctop_avg
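The row normalization used twice above simply rescales each document's topic weights to sum to 1. A tiny worked example:

doctop = [[2.0, 1.0, 1.0], [0.5, 0.5, 1.0]]
norm_all = [[value / sum(doc) for value in doc] for doc in doctop]
# norm_all == [[0.5, 0.25, 0.25], [0.25, 0.25, 0.5]]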
Example #30
    def test_encounter_process(self):
        encounter = FHIREncounterResourceManager()
        with open('test/encounter.ndjson') as f:
            encounter_data = ndjson.load(f)

        data = encounter.run_encounter_process(encounter_data[0])

        self.assertNotEqual(data['source_id'], None)
        self.assertNotEqual(data['patient'], None)
        self.assertNotEqual(data['start_date'], None)
        self.assertNotEqual(data['end_date'], None)