import io
import os
import random

import chardet
import pandas as pd


def sample_distinct_lines_from_file(path, total_num_lines=None, sample_size=100,
                                    skip_first_nlines=1, encoding='utf-8',
                                    random_seed=42):
    """Quickly sample distinct lines from a text file.

    If sample_size is close to the line count of the file, the entire file
    will be read. If the file contains many duplicate lines, the returned
    sample may contain fewer lines than requested by the sample_size
    parameter. The algorithm also has an element of randomness, so if you are
    unlucky enough the returned sample may still contain fewer lines than
    requested.

    For a detailed explanation of the algorithm, see the third algorithm on
    http://metadatascience.com/2014/02/27/random-sampling-from-very-large-files/.

    Returns:
        list: List of sampled lines.
    """
    is_file = check_is_file(path)
    if not is_file['result']:
        return is_file

    sample = {}
    file_size = os.path.getsize(path)
    if not total_num_lines:
        total_num_lines_res = approximate_record_number(path, encoding=encoding)
        if isinstance(total_num_lines_res['result'], int):
            # successfully estimated the line count of the file
            total_num_lines = total_num_lines_res['result']
        else:
            return total_num_lines_res

    with open(path, 'rb') as f:
        if skip_first_nlines and isinstance(skip_first_nlines, int):
            first_nlines_to_skip = set(f.readline() for _ in range(skip_first_nlines))
        else:
            first_nlines_to_skip = set()

        # if the sample size is almost the same as the line count, simply read the whole file
        if total_num_lines / sample_size < 1.2:
            lines = [line.decode(encoding) for line in f.readlines()
                     if line not in first_nlines_to_skip]
            if len(lines) >= sample_size:
                return generate_response(result=random.sample(lines, sample_size))
            return generate_response(
                result=lines,
                warning='Sample size is larger than the line count, so the entire file is returned')

        random.seed(random_seed)
        sampling_times = 0
        while len(sample) < sample_size and sampling_times <= sample_size * 2:
            sampling_times += 1  # count every attempt, or a skipped line could loop forever
            pos = random.randrange(file_size)
            f.seek(pos)
            f.readline()  # discard a (most likely partial) line
            line = f.readline()
            if not line or line in first_nlines_to_skip:
                # landed at EOF or on a line that should be skipped
                continue
            sample.setdefault(line, pos)
        sample = sorted(sample.items(), key=lambda x: x[1])
        sample = [line.decode(encoding) for (line, pos) in sample][:sample_size]
        if len(sample) < sample_size:
            warning = ('The program has sampled %i times and still not been able '
                       'to sample %i distinct lines.\n'
                       'Possible reasons are 1. there are many duplicate lines in the file,\n'
                       '2. there is a lurking bug in the code,\n'
                       'or 3. you are just extremely unlucky.' % (sample_size * 2, sample_size))
            return generate_response(result=sample, warning=warning)
        return generate_response(result=sample)
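# Every helper in this module wraps its return value in a generate_response
# envelope. The real implementation is not part of this excerpt; the sketch
# below only documents the shape these functions assume (a dict with a
# 'result' payload and an optional 'warning' message) and may differ from it.
def generate_response(result=None, warning=None):
    return {'result': result, 'warning': warning}


# Hypothetical usage of the sampler; 'events.csv' is a made-up path, not a
# name taken from the source.
def _demo_sample_distinct_lines():
    res = sample_distinct_lines_from_file('events.csv', sample_size=50,
                                          skip_first_nlines=1)
    if res['warning']:
        print(res['warning'])
    print('%d lines sampled' % len(res['result']))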
def approximate_record_number(file_path, encoding=None):
    """Estimate the number of data records in a text file or a database.

    Args:
        file_path: Path of the file.
        encoding: Encoding of the file. If set to a falsy value, the
            `chardet` library is used to detect the encoding.

    Returns:
        int: Approximate number of lines in the file. 0 if the file is empty.
    """
    num_test_lines = 1000
    is_file = check_is_file(file_path)
    if not is_file['result']:
        return is_file

    raw_text = []
    with open(file_path, 'rb') as f:
        # read up to the first 1000 lines
        for _ in range(num_test_lines):
            try:
                raw_text.append(next(f))
            except StopIteration:
                break
    num_test_lines = len(raw_text)
    raw_text = b''.join(raw_text)
    if not raw_text:  # empty file
        return generate_response(result=0, warning='The file is empty')

    if not encoding:
        encoding = chardet.detect(raw_text)['encoding']
    raw_text.decode(encoding)  # fail early if the encoding is wrong
    # compare byte sizes, since os.path.getsize reports bytes as well;
    # measuring decoded characters would skew the ratio for multi-byte encodings
    raw_text_size = len(raw_text)
    file_size = check_file_size(file_path)['result']
    if raw_text_size:
        return generate_response(result=int(file_size / raw_text_size * num_test_lines))
    return generate_response(
        warning='An error occurred and the program failed to obtain an estimation of the line count.')
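# The estimate is plain proportional scaling: if the first n lines occupy b
# bytes of an s-byte file, the file holds roughly s / b * n lines. With
# hypothetical numbers, 1000 lines taking 64_000 bytes of a 6_400_000-byte
# file give 6_400_000 / 64_000 * 1000 = 100_000 lines.
def _demo_approximate_record_number():
    res = approximate_record_number('events.csv')  # 'events.csv' is a made-up path
    print('approximately %s lines' % res['result'])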
def parse_json_format(file_path=None):
    """Detect whether a json file is standard json or jsonl (json lines).

    Returns:
        str: 'json' or 'jsonl'
    """
    is_file_res = check_is_file(file_path)
    if not is_file_res['result']:
        return is_file_res
    with open(file_path) as f:
        # A file that opens with '[' is a standard json array; jsonl files
        # carry one json value per line, typically starting with '{'.
        if f.readline().strip().startswith('['):
            return generate_response(result='json')
    return generate_response(result='jsonl')
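# Demo of the first-line heuristic; both file names are made up. Note that a
# standard json file whose top-level value is an object rather than an array
# would still be reported as 'jsonl' by this check.
def _demo_parse_json_format():
    print(parse_json_format('data.json')['result'])    # pretty-printed array -> 'json'
    print(parse_json_format('data.jsonl')['result'])   # one object per line  -> 'jsonl'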
def parse_data_format_from_path(path):
    """Extract the data format from a path.

    Args:
        path(str): File path or database URI.

    Returns:
        str: Parsed data format
    """
    data_format = None
    if os.path.isfile(path):
        # splitext keeps the leading dot and returns '' for extensionless files,
        # which should be treated as "no format found"
        data_format = os.path.splitext(path)[1].lower() or None
    else:
        # TODO: Add support for more data formats
        pass
    if data_format is None:
        return generate_response(warning='The data format could not be parsed from the path %s.' % path)
    # elif data_format not in supported_formats:
    #     return generate_response(warning='The data format %s is currently not supported.' % data_format)
    return generate_response(result=data_format)
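# Hypothetical call; the result keeps splitext's leading dot, e.g. '.csv'.
def _demo_parse_data_format_from_path():
    res = parse_data_format_from_path('/data/events.csv')  # made-up path
    print(res['result'])  # -> '.csv' when the file exists, otherwise a warning is set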
def _load_csv(self, path, encoding=None, sample_size=None, sample_method='random',
              sep=',', header_line=0, record_num=None, **kwargs):
    """Load a csv/tsv file into a dataframe.

    If the file is too large for memory, the sample_size argument can be set
    to read only a certain number of lines.

    Args:
        header_line: Indicates whether the first line is a header. Legitimate
            values are 0, True and False. 0 and True mean the first line is a
            header and False means otherwise. Hierarchical indexes are not
            currently supported.
        sample_size(int): Number of rows to return. Defaults to None; if set
            to 0, None or another falsy value, the entire dataframe read from
            the path is returned regardless of the size of the file.
        sample_method(str): Selects rows randomly if set to 'random'.
            Legitimate values are 'random', 'first' and 'all'.
    """
    if sample_method:
        sample_method = sample_method.lower()
        if sample_method not in ('random', 'first', 'all'):
            return generate_response(
                warning='Received invalid value %s for parameter sample_method' % str(sample_method))
    # translate header_line (0/True/False per the docstring) into what pandas
    # expects: 0 for "first line is a header", None for "no header"
    pd_header = None if header_line is False else 0
    if not sample_size or sample_method == 'all':
        # return the entire dataframe regardless of the size of the file
        return generate_response(
            result=pd.read_csv(filepath_or_buffer=path, encoding=encoding, sep=sep,
                               header=pd_header, **kwargs))
    elif sample_method == 'first':
        return generate_response(
            result=pd.read_csv(filepath_or_buffer=path, nrows=int(sample_size),
                               encoding=encoding, sep=sep, header=pd_header, **kwargs))
    else:  # randomly select sample_size rows
        skip_first_nlines = 0  # number of rows at the beginning to skip
        header = None  # header line
        if 'skiprows' in kwargs:
            skip_first_nlines += int(kwargs.pop('skiprows'))
        if pd_header == 0:
            with open(path, encoding=encoding) as f:
                header = f.readline()
            skip_first_nlines += 1
        sampled_lines_res = utilities.sample_distinct_lines_from_file(
            path=path, total_num_lines=record_num, sample_size=sample_size,
            skip_first_nlines=skip_first_nlines, encoding=encoding)
        sampled_lines = sampled_lines_res['result']
        sampled_lines_warning = sampled_lines_res['warning']
        if sampled_lines:
            # compare the number of sampled lines (not the response dict) to sample_size
            warning = '' if len(sampled_lines) == sample_size else sampled_lines_warning
            if header:
                sampled_lines.insert(0, header)
            sampled_lines_df = pd.read_csv(io.StringIO(''.join(sampled_lines)),
                                           sep=sep, header=pd_header, **kwargs)
            return generate_response(result=sampled_lines_df, warning=warning)
        return sampled_lines_res
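# _load_csv is written as a method, so a caller reaches it through whatever
# loader class hosts it. DataLoader below is a hypothetical stand-in for that
# class, not a name taken from the source.
def _demo_load_csv():
    loader = DataLoader()  # hypothetical host class providing _load_csv
    res = loader._load_csv('events.csv', sample_size=500, sample_method='random',
                           header_line=0, encoding='utf-8')
    df = res['result']  # a pandas DataFrame of up to 500 randomly sampled rows
    if res['warning']:
        print(res['warning'])
    return df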
def check_is_file(file_path):
    """Check whether the value of the file_path argument is a valid file path."""
    if not os.path.isfile(file_path):
        return generate_response(result=False,
                                 warning='File not found. Please check the file path passed in.')
    return generate_response(result=True)
def check_file_size(file_path):
    """Return the size of the file in bytes."""
    is_file = check_is_file(file_path)
    if not is_file['result']:
        return is_file  # propagate the failed check instead of returning None
    return generate_response(result=os.path.getsize(file_path))
def check_encoding(file_path_or_text=None):
    """Detect the encoding of a file (given its path) or of raw bytes."""
    if check_is_file(file_path_or_text)['result']:
        with open(file_path_or_text, 'rb') as f:
            file_path_or_text = b''.join(f.readline() for _ in range(100))
    # chardet expects bytes; anything that is not a valid file path must be raw bytes
    return generate_response(result=chardet.detect(file_path_or_text)['encoding'])
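# The three check helpers compose into a simple pre-flight routine;
# 'events.csv' is a made-up path used only for illustration.
def _demo_file_checks():
    if check_is_file('events.csv')['result']:
        size = check_file_size('events.csv')['result']  # size in bytes
        enc = check_encoding('events.csv')['result']    # e.g. 'utf-8' or 'ascii'
        print('size: %d bytes, encoding: %s' % (size, enc))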