Пример #1
0
def join() -> None:
    """
    Join the input file against a key/value table read from a second file.

    The table is loaded from ConfigJoin.hash_file (key and value columns are
    taken from ConfigJoin). Every input line whose key is in the table is
    written to the output with the mapped value inserted at
    ConfigJoin.output_insert_column. A line whose key is missing gets the
    literal "unknown" inserted instead when ConfigJoin.output_add_unknown is
    set, and is dropped otherwise. Three counters are printed at the end.
    """
    lookup = {}
    found = 0
    unknown_added = 0
    discarded = 0

    # first pass: load the whole key -> value table into memory
    with TsvReader(ConfigJoin.hash_file) as hash_rows:
        if ConfigProgress.progress:
            hash_rows = tqdm.tqdm(hash_rows, desc="reading hash")
        for row in hash_rows:
            table_key = row[ConfigJoin.hash_key_column]
            table_value = row[ConfigJoin.hash_value_column]
            lookup[table_key] = table_value

    # second pass: stream the input and decide what to insert for each line
    with TsvReader(ConfigInputFile.input_file) as input_rows, \
            TsvWriter(ConfigOutputFile.output_file) as output_rows:
        if ConfigProgress.progress:
            input_rows = tqdm.tqdm(
                input_rows, desc="reading input and writing output")
        for row in input_rows:
            key = row[ConfigJoin.input_key_column]
            if key in lookup:
                found += 1
                inserted = lookup[key]
            elif ConfigJoin.output_add_unknown:
                unknown_added += 1
                inserted = "unknown"
            else:
                discarded += 1
                continue
            row.insert(ConfigJoin.output_insert_column, inserted)
            output_rows.write(row)

    print("event_found {}".format(found))
    print("event_unknown_added {}".format(unknown_added))
    print("event_discarded {}".format(discarded))
Пример #2
0
def majority() -> None:
    """
    For every value of the first column keep only its dominant partner from
    the second column.

    Each input line contributes the integer found in
    ConfigMajority.input_multiplication_column to the pair made of its first
    and second column values. For every first value the second value with the
    largest accumulated total is written, together with that total.

    This means that if x1 appears more
    with y2 than any other values in column Y then x1, y2 will be in the output
    and no other entry with x1 will appear
    """
    # first value -> (second value -> accumulated weight)
    d: Dict[str, Dict[str, int]] = defaultdict(dict)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            p_first = fields[ConfigMajority.input_first_column]
            p_second = fields[ConfigMajority.input_second_column]
            p_multiplication = int(
                fields[ConfigMajority.input_multiplication_column])
            p_dict = d[p_first]
            p_dict[p_second] = p_dict.get(p_second, 0) + p_multiplication
    with TsvWriter(
            filename=ConfigOutputFile.output_file) as output_file_handle:
        for p_first, p_dict in d.items():
            # on a tie max() keeps the second value that was seen first
            p_second = max(p_dict, key=p_dict.get)
            output_file_handle.write([
                p_first,
                p_second,
                str(p_dict[p_second]),
            ])
Пример #3
0
def check() -> None:
    """
    Validate every configured input file by reading it from start to end.

    The lines themselves are discarded; the files are only opened with a
    TsvReader configured from ConfigNumFields and ConfigTsvReader and then
    consumed.

    When ConfigParallel.parallel is set the files are handed to check_file in
    a process pool of ConfigParallel.jobs workers and the list of per file
    results is printed. Otherwise the files are read one after the other in
    this process.

    TODO:
    - add ability to say how many lines are bad and print their content
    """
    if ConfigParallel.parallel:
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=ConfigParallel.jobs) as executor:
            job_list = []
            for input_file in ConfigInputFiles.input_files:
                job = ParamsForJob()
                job.progress = ConfigProgress.progress
                job.check_non_ascii = ConfigTsvReader.check_non_ascii
                job.num_fields = ConfigNumFields.num_fields
                job.input_file = input_file
                job.validate_all_lines_same_number_of_fields = ConfigTsvReader.validate_all_lines_same_number_of_fields
                job_list.append(job)
            # list() drains the map so an exception raised in a worker
            # surfaces here
            results = list(executor.map(check_file, job_list))
        print(results)
        # the workers already read every file, do not read them a second time
        return
    for input_file in ConfigInputFiles.input_files:
        with TsvReader(
                filename=input_file,
                num_fields=ConfigNumFields.num_fields,
                validate_all_lines_same_number_of_fields=ConfigTsvReader.
                validate_all_lines_same_number_of_fields,
                check_non_ascii=ConfigTsvReader.check_non_ascii,
        ) as input_file_handle:
            if ConfigProgress.progress:
                input_file_handle = tqdm.tqdm(input_file_handle,
                                              desc=input_file)
            for _ in input_file_handle:
                pass
Пример #4
0
def read() -> None:
    """
    Read every configured input file to its end and discard the lines.

    A progress bar labelled with the file name is shown when
    ConfigProgress.progress is set.
    """
    show_progress = ConfigProgress.progress
    for path in ConfigInputFiles.input_files:
        with TsvReader(filename=path) as reader:
            rows = tqdm.tqdm(reader, desc=path) if show_progress else reader
            # iterate only for the side effect of consuming the file
            for _row in rows:
                continue
Пример #5
0
def sum_columns() -> None:
    """
    Print the sums of the columns listed in ConfigColumns.columns.

    Every selected field is converted with float() and added to the total of
    its column; the list of totals is printed once the input is exhausted.
    """
    columns = ConfigColumns.columns
    totals = [0] * len(columns)
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        rows = tqdm.tqdm(reader) if ConfigProgress.progress else reader
        for row in rows:
            for slot, column in enumerate(columns):
                totals[slot] = totals[slot] + float(row[column])
    print(totals)
Пример #6
0
def cut() -> None:
    """
    Write only the columns listed in ConfigColumns.columns, in that order,
    for every line of the input file.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        rows = tqdm.tqdm(reader) if ConfigProgress.progress else reader
        for row in rows:
            selected = []
            for column in ConfigColumns.columns:
                selected.append(row[column])
            writer.write(selected)
Пример #7
0
def multiply() -> None:
    """
    Repeat every input line as many times as the integer stored in its
    ConfigColumn.column field.

    A line whose count is zero or negative is not written at all.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        rows = tqdm.tqdm(reader) if ConfigProgress.progress else reader
        for row in rows:
            remaining = int(row[ConfigColumn.column])
            while remaining > 0:
                writer.write(row)
                remaining -= 1
Пример #8
0
def lc() -> None:
    """
    Copy the input file to the output file with the fields listed in
    ConfigColumns.columns converted to lower case.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        rows = tqdm.tqdm(reader) if ConfigProgress.progress else reader
        for row in rows:
            for column in ConfigColumns.columns:
                lowered = row[column].lower()
                row[column] = lowered
            writer.write(row)
Пример #9
0
def remove_quotes() -> None:
    """
    Strip one pair of surrounding double quotes from the fields listed in
    ConfigColumns.columns.

    A field is changed only when it both starts and ends with a double quote
    and is longer than one character, so a lone quote is left as is.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        rows = tqdm.tqdm(reader) if ConfigProgress.progress else reader
        for row in rows:
            for column in ConfigColumns.columns:
                value = row[column]
                if not value.startswith("\""):
                    continue
                if not value.endswith("\""):
                    continue
                if len(value) <= 1:
                    continue
                row[column] = value[1:-1]
            writer.write(row)
Пример #10
0
def clean_by_field_num() -> None:
    """
    Copy to the output only the input lines whose number of fields equals
    ConfigColumns.columns.

    The reader is opened with the same-number-of-fields validation switched
    off so that lines of differing length can be read at all.
    """
    # NOTE(review): other functions in this file iterate over
    # ConfigColumns.columns as a collection of indexes. If it is the same
    # object here, comparing it to an int is never true and no line is
    # written - confirm where the expected field count should come from.
    expected = ConfigColumns.columns
    with TsvReader(
            filename=ConfigInputFile.input_file,
            validate_all_lines_same_number_of_fields=False,
    ) as reader, TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        if ConfigProgress.progress:
            rows = tqdm.tqdm(reader, desc=ConfigInputFile.input_file)
        else:
            rows = reader
        for row in rows:
            if len(row) != expected:
                continue
            writer.write(row)
Пример #11
0
def check_file(params_for_job: ParamsForJob) -> bool:
    """
    Read one file to its end using the reader settings carried by the job.

    :param params_for_job: holds the input file name, the reader options
        (num_fields, validate_all_lines_same_number_of_fields,
        check_non_ascii) and the progress flag
    :return: always True once the whole file has been consumed
    """
    job = params_for_job
    print('checking [{}]...'.format(job.input_file))
    reader_options = {
        "filename": job.input_file,
        "num_fields": job.num_fields,
        "validate_all_lines_same_number_of_fields":
            job.validate_all_lines_same_number_of_fields,
        "check_non_ascii": job.check_non_ascii,
    }
    with TsvReader(**reader_options) as reader:
        rows = tqdm.tqdm(reader) if job.progress else reader
        # the lines are not used, the file only has to be consumed
        for _row in rows:
            continue
    return True
Пример #12
0
def drop_duplicates_by_columns() -> None:
    """
    Copy the input file to the output file, keeping only the first line seen
    for every combination of values in the columns listed in
    ConfigColumns.columns.

    All distinct combinations are held in memory until the input is
    exhausted.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        saw = set()
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as output_file_handle:
            for fields in input_file_handle:
                # a tuple (unlike a frozenset) keeps the column order and
                # repeated values, so ("a", "b") and ("b", "a") are different
                # keys and ("a", "a") is not confused with ("a",)
                match = tuple(
                    fields[match_column]
                    for match_column in ConfigColumns.columns
                )
                if match not in saw:
                    saw.add(match)
                    output_file_handle.write(fields)
Пример #13
0
def tree() -> None:
    """
    Draw the parent/child relations found in the input file as a text tree.

    Every input line supplies one edge: the field at ConfigTree.parent_column
    is the parent and the field at ConfigTree.child_column is the child.
    Drawing starts from ConfigTree.roots when it is set, otherwise from every
    parent that never appears as a child.

    You can also see only parts of the tree
    (by setting ConfigTree.roots).

    The children of a node are drawn in sorted order. No record of visited
    nodes is kept, so a cycle reachable from a root makes the drawing loop run
    forever.
    """
    children_dict: Dict[str, Set[str]] = defaultdict(set)
    parents_dict: Dict[str, Set[str]] = defaultdict(set)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        for fields in input_file_handle:
            p_parent = fields[ConfigTree.parent_column]
            p_child = fields[ConfigTree.child_column]
            children_dict[p_parent].add(p_child)
            parents_dict[p_child].add(p_parent)
    # find the roots (parents that have no parents)
    if ConfigTree.roots:
        list_of_roots = ConfigTree.roots
    else:
        list_of_roots = [
            p_parent for p_parent in children_dict
            if not parents_dict.get(p_parent)
        ]

    # each stack entry is (name, is last sibling, prefix printed before it).
    # entries are popped from the end, so the first root is drawn last and is
    # therefore the one that carries the "last sibling" marker
    stack = [
        (root, index == 0, "") for index, root in enumerate(list_of_roots)
    ]
    # lets draw the tree
    while stack:
        name, last, print_list = stack.pop()
        special_string = u"└──" if last else u"├──"
        print("{}{}".format(print_list + special_string, name))
        # below a last sibling nothing continues, otherwise keep the bar
        child_prefix = print_list + ("   " if last else u"│  ")
        # sorted so that the drawing does not depend on set iteration order
        children = sorted(children_dict.get(name, ()))
        last_index = len(children) - 1
        # push back to front: the first child ends on top of the stack and is
        # drawn first, the child flagged as last is drawn last
        for index in range(last_index, -1, -1):
            stack.append((children[index], index == last_index, child_prefix))
Пример #14
0
def sample_by_column_old() -> None:
    """
    Sample lines from the input file with probability proportional to the
    number stored in ConfigSampleColumn.sample_column.

    The whole input is loaded into memory and the weights are normalised to
    sum to one. Two modes exist:

    - hits mode (ConfigSampleByColumnOld.hits_mode): ConfigSampleSize.size
      independent single draws are made, and every line that was drawn at
      least once is written with its number of hits appended as a last field.
    - otherwise: a single draw of ConfigSampleSize.size lines is made and the
      drawn lines are written in the order they were drawn.
    """
    weights = []
    elements = []
    sum_weights = float(0)
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        if ConfigProgress.progress:
            input_handle = tqdm.tqdm(input_handle)
        for fields in input_handle:
            elements.append(fields)
            weight = float(fields[ConfigSampleColumn.sample_column])
            sum_weights += weight
            weights.append(weight)
    # the following code will only work on python3.6 because the
    # random.choices API was only introduced then
    # from random import choices
    # results = choices(lines, weights, k=size)

    # this is the same code with numpy
    weights = [w / sum_weights for w in weights]
    if ConfigSampleByColumnOld.hits_mode:
        # index of drawn line -> number of times it was drawn
        results_dict = defaultdict(int)
        for _ in range(ConfigSampleSize.size):
            current_result = numpy.random.choice(
                a=len(elements),
                replace=ConfigReplace.replace,
                size=1,
                p=weights,
            )
            results_dict[current_result[0]] += 1
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result, hits in results_dict.items():
                # copy so the loaded line is not modified
                record = list(elements[result])
                # fields are strings everywhere else in a record, so the
                # count is converted before it is handed to the writer
                record.append(str(hits))
                output_handle.write(record)
    else:
        results = numpy.random.choice(
            a=len(elements),
            replace=ConfigReplace.replace,
            size=ConfigSampleSize.size,
            p=weights,
        )
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result in results:
                output_handle.write(elements[result])
Пример #15
0
def fix_columns() -> None:
    """
    Copy the input file to the output file, passing the fields listed in
    ConfigColumns.columns through clean() with the options held in
    ConfigFixTypes.
    """
    # We need to read the input file WITHOUT assuming that it hasn't problems
    reader = TsvReader(
        filename=ConfigInputFile.input_file,
        check_non_ascii=ConfigTsvReader.check_non_ascii,
    )
    with reader as input_rows:
        if ConfigProgress.progress:
            input_rows = tqdm.tqdm(input_rows)
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            for row in input_rows:
                for column in ConfigColumns.columns:
                    cleaned = clean(
                        text=row[column],
                        clean_edges=ConfigFixTypes.clean_edges,
                        sub_trailing=ConfigFixTypes.sub_trailing,
                        remove_non_ascii=ConfigFixTypes.remove_non_ascii,
                        lower_case=ConfigFixTypes.lower_case,
                    )
                    row[column] = cleaned
                writer.write(row)
Пример #16
0
def histogram_by_column() -> None:
    """
    Write a histogram of the numbers found in ConfigColumn.column.

    The values are split by numpy.histogram into
    ConfigBucketNumber.bucket_number buckets. One line is written per bucket:
    lower edge, upper edge, number of values in the bucket and the cumulative
    percentage (truncated to an integer) of values up to and including it.
    """
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        values = [
            float(fields[ConfigColumn.column]) for fields in input_handle
        ]
    total = len(values)
    count_in_bucket, bucket_edges = numpy.histogram(
        values, bins=ConfigBucketNumber.bucket_number)
    # consecutive edges delimit a bucket: (edge[i], edge[i + 1])
    buckets = zip(bucket_edges[:-1], bucket_edges[1:], count_in_bucket)
    with TsvWriter(ConfigOutputFile.output_file) as output_handle:
        running = 0
        for edge_from, edge_to, count in buckets:
            running += count
            percent = int(100.0 * running / total)
            output_handle.write([
                str(edge_from),
                str(edge_to),
                str(count),
                str(percent),
            ])
Пример #17
0
def process_single_file(job_info: JobInfo) -> JobReturnValue:
    """
    Split one input file into one output file per distinct key.

    The key of a line is the comma joined values of job_info.columns. The
    output file name of a key is job_info.pattern formatted with that key and
    with job_info.serial as ``i``.

    :param job_info: input file, columns, file name pattern, serial number
        and the progress / check_not_ascii flags of this job
    :return: a JobReturnValue built from the job serial number and a dict
        mapping every key to the file name that was written for it
    """
    logger = logging.getLogger(__name__)
    tsv_writers_dict = dict()
    results = dict()
    try:
        with TsvReader(
                filename=job_info.input_file,
                check_non_ascii=job_info.check_not_ascii,
        ) as input_file_handle:
            if job_info.progress:
                logger.info(f"working on [{job_info.input_file}]")
                input_file_handle = tqdm.tqdm(input_file_handle)
            for fields in input_file_handle:
                key = ",".join([fields[x] for x in job_info.columns])
                output_handle = tsv_writers_dict.get(key)
                if output_handle is None:
                    # first line with this key: open its output file
                    filename = job_info.pattern.format(
                        key=key, i=job_info.serial)
                    results[key] = filename
                    output_handle = TsvWriter(filename=filename)
                    tsv_writers_dict[key] = output_handle
                output_handle.write(fields)
    finally:
        # close every writer that was opened, also when reading or writing
        # failed half way
        for v in tsv_writers_dict.values():
            v.close()
    return JobReturnValue(job_info.serial, results)
Пример #18
0
def split_by_columns() -> None:
    """
    Split all input files into one output file per distinct key.

    The key of a line is the comma joined values of ConfigColumns.columns and
    the output file name of a key is ConfigPattern.pattern formatted with that
    key. The writers are shared by all input files, so lines with the same key
    from different files end up in the same output file.
    """
    pylogconf.core.setup()
    logger = logging.getLogger(__name__)
    assert len(ConfigColumns.columns) > 0, "must provide --columns"
    tsv_writers_dict = dict()
    try:
        for input_file in ConfigInputFiles.input_files:
            with TsvReader(filename=input_file,
                           check_non_ascii=ConfigTsvReader.check_non_ascii
                           ) as input_file_handle:
                if ConfigProgress.progress:
                    logger.info(f"working on [{input_file}]")
                    input_file_handle = tqdm.tqdm(input_file_handle)
                for fields in input_file_handle:
                    key = ",".join([fields[x] for x in ConfigColumns.columns])
                    output_handle = tsv_writers_dict.get(key)
                    if output_handle is None:
                        # first line with this key: open its output file
                        filename = ConfigPattern.pattern.format(key=key)
                        output_handle = TsvWriter(filename=filename)
                        tsv_writers_dict[key] = output_handle
                    output_handle.write(fields)
    finally:
        # close all writers, also when reading or writing failed half way
        for v in tsv_writers_dict.values():
            v.close()
Пример #19
0
def check_columns_unique() -> None:
    """
    Verify that no value repeats inside any of the columns listed in
    ConfigColumns.columns.

    The values seen are remembered per column and across all input files.
    Every repetition is printed together with the line number of the first
    occurrence and of the repetition; the numbers are zero based and restart
    for each file. An AssertionError is raised at the end if any repetition
    was found.
    """
    columns = ConfigColumns.columns
    # one "value -> line number of first occurrence" mapping per column
    first_seen = [{} for _ in range(len(columns))]
    duplicates_found = False
    for path in ConfigInputFiles.input_files:
        with TsvReader(filename=path, ) as reader:
            rows = reader
            if ConfigProgress.progress:
                rows = tqdm.tqdm(reader, desc=path)
            for line_number, row in enumerate(rows):
                for seen, column in zip(first_seen, columns):
                    value = row[column]
                    if value not in seen:
                        seen[value] = line_number
                        continue
                    print(
                        "value [{}] is duplicate on lines [{}, {}]".format(
                            value,
                            seen[value],
                            line_number,
                        ))
                    duplicates_found = True
    assert duplicates_found is False, "found errors"
Пример #20
0
def read_all_file(filename: str) -> None:
    """
    Open the file with a TsvReader and consume it to the end.

    :param filename: name of the file to read
    """
    with TsvReader(filename=filename) as reader:
        # the lines are not used, the file only has to be read through
        for _row in reader:
            continue
Пример #21
0
 def testGoodFile(self):
     """
     Constructing a TsvReader on the good file and closing it must not raise.
     """
     reader = TsvReader(filename=file_good)
     reader.close()
Пример #22
0
def tsv_to_csv() -> None:
    """
    Convert the input TSV file to a CSV file.

    Every line read by the TsvReader is written as one row by a csv.writer
    with its default dialect.
    """
    # newline="" is what the csv module asks for: the writer emits its own
    # line terminators and they must not be translated a second time by the
    # text layer
    with open(ConfigOutputFile.output_file, "wt",
              newline="") as output_file_handle:
        csv_writer = csv.writer(output_file_handle)
        with TsvReader(ConfigInputFile.input_file) as input_file_handle:
            for fields in input_file_handle:
                csv_writer.writerow(fields)