Example #1
def extract_comparisons_from_file(filename):
    comparisons = []
    total = 0

    with open(filename) as f:
        data = f.readlines()
        sequences = []
        for row in data:
            # BLAST hit rows are prefixed with "lcl|"
            if row.startswith("lcl|"):
                sequences.append({
                    "id": total,
                    "values": [value.strip() for value in row.split(" ")],
                })
                total += 1

        log.datetime_log("Starting process for filename {}".format(filename))

        with Pool(processes=10) as pool:
            comparisons = pool.map(partial(get_relevant_data, total=total),
                                   sequences)

        log.datetime_log("Finishing process for filename {}".format(filename))

    return comparisons
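
Since Pool.map feeds the worker a single iterable, partial() is what carries the shared total into get_relevant_data. A minimal self-contained sketch of the same pattern (the worker and data here are illustrative, not from the module):

from functools import partial
from multiprocessing import Pool

def label_item(item, total):
    # Stand-in worker: combine the per-item id with the shared total
    return "{}/{}".format(item["id"], total)

if __name__ == "__main__":
    items = [{"id": i} for i in range(4)]
    with Pool(processes=2) as pool:
        # partial() pins total so map() only has to supply each item
        results = pool.map(partial(label_item, total=len(items)), items)
    print(results)  # ['0/4', '1/4', '2/4', '3/4']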
Example #2
def get_relevant_data(values, total):
    count = values["id"]
    values = values["values"]
    taxid = get_taxid_from_sequence(values[2])

    organism_result = get_taxonomy_from_taxid(taxid)

    # Walk the fields from the end; the second one that parses as a
    # float is taken as the score
    i = len(values) - 1
    cont = 0
    score = 0
    while i >= 0:  # guard against rows with fewer than two numeric fields
        try:
            num = float(values[i])
            cont += 1
            if cont == 2:
                score = num
                break
        except ValueError:
            pass
        i -= 1

    count += 1

    organism_result["SCORE"] = score
    log.datetime_log(
        "Classified sequence with id.{} out of {} sequences.".format(
            count, total))

    return organism_result
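
The backward scan is easier to read in isolation: it walks the fields from the right and keeps the second one that parses as a float (which column that lands on depends on the row format; BLAST-style tabular rows end in numeric columns). A standalone sketch with an illustrative row:

def second_float_from_end(fields):
    # Count numeric fields from the right; return the second one found
    found = 0
    for field in reversed(fields):
        try:
            value = float(field)
        except ValueError:
            continue
        found += 1
        if found == 2:
            return value
    return 0

print(second_float_from_end(["lcl|query1", "hit42", "4e-52", "201.0"]))
# -> 4e-52 (the second numeric field, counting from the end)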
Example #3
def post_prune_trees():

    output = {}

    try:
        data = request.get_json()
        merged_tree = json.loads(data['mergedTree'])
        threshold = float(data['threshold'])

        sequences = list(merged_tree['SCORE'].keys())
        saved_sequences, rest = utils.get_unsaved_sequences(sequences)

        pruned_sequences = []

        for sequence in saved_sequences:
            pruned_sequence = {}
            pruned_sequence['sequence_id'] = sequence['sequence_id']
            pruned_sequence['hierarchy'] = utils.prune_tree(
                threshold, sequence['hierarchy'])
            pruned_sequences.append(pruned_sequence)

        pruned_tree = utils.prune_tree(threshold, merged_tree)

        output['pruned_sequences'] = pruned_sequences
        output['pruned_tree'] = pruned_tree

        return jsonify(output)

    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)
Example #4
def get_taxonomy_from_taxid(taxid):
    # Copy the default so the shared NONE_RANK constant is never mutated
    taxonomy_dict = dict(NONE_RANK)
    try:
        rank, tax_name, parent_taxid = get_rank_from_taxid(taxid)
        while parent_taxid != 1:
            if rank != "NO RANK":
                taxonomy_dict[rank] = tax_name

            rank, tax_name, parent_taxid = get_rank_from_taxid(parent_taxid)

        # Check if it has the minimum rankings
        for min_rank in MINIMUM_RANKS:
            if min_rank not in taxonomy_dict:
                # Fall back to any rank that contains the missing rank as a
                # substring (e.g. "kingdom" matches "superkingdom")
                possible_ranks = [
                    rank for rank in taxonomy_dict if min_rank in rank
                ]

                if possible_ranks:
                    taxonomy_dict[min_rank] = taxonomy_dict[possible_ranks[0]]

                else:
                    taxonomy_dict[min_rank] = "undefined"

        return taxonomy_dict
    except Exception:
        log.datetime_log("Not able to find rank of taxid {}".format(taxid))
        return NONE_RANK
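
The function depends on two module-level constants the snippet does not show. One plausible shape, stated purely as an assumption to make the control flow readable:

# Hypothetical definitions; the real module may differ.
MINIMUM_RANKS = ["kingdom", "phylum", "class", "order",
                 "family", "genus", "species"]

# Default taxonomy returned when a taxid cannot be resolved. It must be
# copied before use, since get_taxonomy_from_taxid fills ranks in place.
NONE_RANK = {rank: "undefined" for rank in MINIMUM_RANKS}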
Example #5
def parse_names_file(filename, **kargs):

    batch_size = kargs.get("batch_size", 1)
    processes = kargs.get("processes", 1)

    with open(filename, "r") as f:
        # iter() with the () sentinel yields batch_size-line tuples until
        # islice() hits EOF and returns an empty tuple
        for i, data in enumerate(
                iter(lambda: tuple(islice(f, batch_size)), ())):
            log.datetime_log("Started batch no. {} name parser".format(i))
            with Pool(processes=processes) as pool:
                # Parse each row once, keeping only rows that produced a name
                parsed_rows = [get_new_values(row) for row in data]
                filtered_data = [row for row in parsed_rows if "name" in row]
                pool.map(update_one, filtered_data)
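
The two-argument iter(callable, sentinel) form used for batching is compact but easy to misread: it calls the lambda repeatedly and stops when the sentinel () comes back. A self-contained sketch over an in-memory file:

from io import StringIO
from itertools import islice

f = StringIO("a\nb\nc\nd\ne\n")
batch_size = 2

# islice() returns an empty tuple at EOF, which matches the sentinel
# and stops the iteration.
for i, batch in enumerate(iter(lambda: tuple(islice(f, batch_size)), ())):
    print(i, batch)
# 0 ('a\n', 'b\n')
# 1 ('c\n', 'd\n')
# 2 ('e\n',)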
Example #6
def remove_files(folder, processed_file):

    with open(processed_file, 'r') as existing_files:
        count_removed = 0
        count_total = 0
        with open(INPUT_FILE, 'w') as input_file:
            for line in existing_files:
                count_total += 1
                info_existing = line.split(":")
                # Strip the JSON-style decoration around the filename field
                existing_filename = info_existing[2].replace('"', "").replace(
                    "{", "").replace("}", "").strip(" \t\n\r")
                input_file.write("{},\n".format(existing_filename))
                existing_filename += ".out.txt"
                existing_path = os.path.join(folder, existing_filename)
                if os.path.isfile(existing_path):
                    count_removed += 1
                    os.remove(existing_path)

        # The loop above has consumed the file, so count lines as we go
        # rather than calling readlines() on an exhausted handle
        log.datetime_log("Removed {} files out of {} saved models".format(
            count_removed, count_total))
Example #7
def post_prune_single_tree():

    output = {}

    try:
        data = request.get_json()
        tree = json.loads(data['tree'])
        threshold = float(data['threshold'])

        pruned_tree = utils.prune_tree(threshold, tree)

        output['pruned_tree'] = pruned_tree

        return jsonify(output)

    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)
Example #8
def parse_sequences(TMP_FOLDER, filename, **kargs):

    batch_size = kargs.get("batch_size", 1)
    processes = kargs.get("processes", 1)

    with open(filename, "r") as f:
        output_paths = []
        for i, data in enumerate(iter(lambda: tuple(islice(f, batch_size)),
                                      ())):
            log.datetime_log("Started batch no. {} sequence parser".format(i))
            with Pool(processes=processes) as pool:
                # Skip FASTA header lines (those starting with ">")
                filtered_data = [row for row in data if not row.startswith(">")]
                output_paths.extend(
                    pool.map(partial(compare_sequence, TMP_FOLDER=TMP_FOLDER),
                             filtered_data))

    return output_paths
Example #9
def generate_and_update_hierarchies(row):
    sequence_id = row.strip(" \t\n\r").replace(",", "")

    with MongoClient() as client:
        db = client.biovis
        db_models = db.models

        search = {"sequence_id": sequence_id}

        saved = db_models.find_one(search)

        if saved is not None and saved["comparisons"] is not None:

            tmp_tree, tmp_hierarchy = utils.get_hierarchy_from_dict(
                sequence_id, saved["comparisons"])

            update = {"hierarchy": tmp_hierarchy, "tree": tmp_tree}

            db_models.update_one(search, {"$set": update})
            log.datetime_log("Updated document {}".format(sequence_id))
Example #10
def post_compare_sequence():
    output = {}

    try:

        merged_tree = {'name': '', 'children': {}, 'SCORE': []}

        data = request.get_json()

        if not "batch_size" in data:
            data["batch_size"] = 1

        data["sequences"] = [
            sequence.strip(" \t\n\r") for sequence in data["sequences"]
        ]

        # Detect sequences processed before
        saved_sequences, tmp_sequences = utils.get_unsaved_sequences(
            data["sequences"])

        # Include previously saved sequences
        processed_batch = saved_sequences.copy()

        for saved_sequence in processed_batch:
            utils.get_hierarchy_from_dict(saved_sequence['sequence_id'],
                                          saved_sequence['comparisons'],
                                          target=merged_tree)

        counter = 0
        current_batch_stop = 0
        pieces_left = len(tmp_sequences) > 0

        while pieces_left:

            tmp_sequences = tmp_sequences[current_batch_stop:]

            num_sequences_left = len(tmp_sequences)

            if data["batch_size"] < num_sequences_left:
                current_batch_stop = data["batch_size"]

            else:
                current_batch_stop = num_sequences_left
                pieces_left = False

            # Compare only the sequences in the current window; slicing here
            # avoids re-comparing the whole remainder on every pass
            current_batch = tmp_sequences[:current_batch_stop]
            file_batch = [
                utils.compare_sequence(sequence) for sequence in current_batch
            ]

            counter += len(current_batch)
            log.datetime_log("{} sequences compared.".format(counter))

            # Generate tree for the sequences in this window
            merged_tree, unsaved_batch = utils.process_batch(
                current_batch, file_batch, merged_tree)

            processed_batch.extend(unsaved_batch)

        # Prepare output
        hierarchy, aggregated_score = utils.form_hierarchy(merged_tree)
        output["merged_tree"] = hierarchy

        output["taxonomies_batch"] = processed_batch

        log.datetime_log("{} hierarchies formed.".format(counter))

        return jsonify(output)

    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)
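
The slice-and-window loop is the core of the batching above; the same control flow, reduced to a standalone sketch with illustrative names:

def iterate_batches(items, batch_size):
    # Yield successive windows of at most batch_size items
    stop = 0
    while items:
        items = items[stop:]
        if not items:
            break
        stop = min(batch_size, len(items))
        yield items[:stop]

for batch in iterate_batches(list(range(7)), 3):
    print(batch)
# [0, 1, 2]
# [3, 4, 5]
# [6]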
Example #11
def upload_file():

    output = {}

    try:
        data = request.get_json()

        if data["file"] is not None and data["filename"] is not None:

            taxonomy = []
            parsed_filename = data["filename"].split(".")[0]
            merged_tree = {'name': '', 'children': {}, 'SCORE': []}

            try:
                file_path = utils.try_to_save_file(data["file"],
                                                   data["filename"])

                log.datetime_log("Succeded saving file.")

                merged_tree, taxonomy = utils.process_batch([parsed_filename],
                                                            [file_path],
                                                            merged_tree)

            except utils.FileExists:
                taxonomy, tmp_sequences = utils.get_unsaved_sequences(
                    [parsed_filename])

                if len(taxonomy) == 0:
                    sequence_id = utils.get_sequence_id(data["filename"])
                    if sequence_id is not None:
                        log.datetime_log(
                            "File existed and sequence {} parsed successfully.".
                            format(sequence_id))
                        taxonomy, tmp_sequences = utils.get_unsaved_sequences(
                            [sequence_id])

                if len(taxonomy) > 0:
                    utils.get_hierarchy_from_dict(taxonomy[0]['sequence_id'],
                                                  taxonomy[0]['comparisons'],
                                                  target=merged_tree)

                else:
                    log.datetime_log(
                        "File existed but sequence not parsed: trying to write a new file."
                    )
                    file_path = ""
                    cont = 0
                    while len(file_path) == 0 and cont < 50:
                        try:
                            file_path = utils.try_to_save_file(
                                data["file"], data["filename"], modifier=cont)

                        except utils.FileExists:
                            cont += 1

                    log.datetime_log(
                        "File successfully saved at {}.".format(file_path))

                    merged_tree, taxonomy = utils.process_batch(
                        [parsed_filename], [file_path], merged_tree)

            # Prepare output
            hierarchy, aggregated_score = utils.form_hierarchy(merged_tree)
            output["merged_tree"] = hierarchy['children'][0]

            output["taxonomies_batch"] = taxonomy
            return jsonify(output)

    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)
Example #12
def read_in_chunks(file, batch_size):
    log.datetime_log("Started batch node parser")
    # Yield successive parsed batches until islice() drains the file
    while True:
        data = [parse_tax(row) for row in islice(file, batch_size)]
        if not data:
            return
        yield data
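
A usage sketch, assuming parse_tax turns one taxonomy dump line into a record; the stub and filename below are illustrative, not from the module:

from itertools import islice

def parse_tax(line):
    # Stand-in parser: split a pipe-delimited taxonomy dump row
    return [field.strip() for field in line.split("|")]

def read_in_chunks(file, batch_size):
    # Same shape as above, without the logging dependency
    while True:
        data = [parse_tax(row) for row in islice(file, batch_size)]
        if not data:
            return
        yield data

with open("nodes.dmp") as f:  # illustrative filename
    for i, chunk in enumerate(read_in_chunks(f, batch_size=1000)):
        print(i, len(chunk))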