示例#1
0
def main():
    """Build a tiling grammar from a folder of shapes and export valid strings.

    Steps, in order:
      1. Collect input files from ``args.in_folder`` via ``process_folder``.
      2. Take the first and last collected file as the two reference inputs,
         build a ``TilingGrammar`` from their strings and store it at
         ``args.out_grammarpath``.
      3. For ``args.num_iterations`` rounds, call ``augment_folder`` on the
         current folder contents, optionally run ``fix_variations`` and prune
         the folder with ``remove_duplicates``.
      4. Reload the stored grammar, keep only the non-empty strings accepted
         by ``check_word`` and write them to ``args.out_filepath`` as an HDF
         table whose single column is named ``args.smiles_column``.

    Raises:
        SystemExit: with status 1 when no input file is found.
    """
    args = get_arguments()

    initial_file_list = []
    process_folder(args.in_folder, initial_file_list)
    if not initial_file_list:
        print("Did not find a valid input file in " + args.in_folder)
        # Signal failure to the caller; the site-provided exit() would have
        # reported status 0 and is not available when Python runs with -S.
        raise SystemExit(1)

    if len(initial_file_list) == 1:
        # A single file serves as both reference inputs.
        initial_file_list.append(initial_file_list[0])
    else:
        initial_file_list = sorted(initial_file_list)

    inputA = initial_file_list[0]
    inputB = initial_file_list[-1]

    initial_smiles_strings = [
        str(obj_tools.obj2string(inputA)),
        str(obj_tools.obj2string(inputB)),
    ]

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    print("max # neighbors: " + str(tile_grammar.max_degree()))
    tile_grammar.store(args.out_grammarpath)

    if args.fix_variations:
        print("fixing variations...")
        fix_variations(args.in_folder, [], inputA, inputB)

    print("removing duplicates...")
    remove_duplicates(tile_grammar, args.in_folder, inputA, inputB,
                      initial_smiles_strings)

    smiles_strings = []
    for i in range(args.num_iterations):
        current_file_list = []
        process_folder(args.in_folder, current_file_list)
        print("Current # of variations: " + str(len(current_file_list)))
        if len(current_file_list) == 1:
            # presumably augment_folder needs at least two entries -- TODO confirm
            current_file_list.append(current_file_list[0])
        # augment_folder appends to smiles_strings in place; duplicates are
        # dropped right after (this does not preserve order).
        augment_folder(current_file_list, smiles_strings)
        smiles_strings = list(set(smiles_strings))
        if args.fix_variations:
            print("fixing variations...")
            fix_variations(args.in_folder, current_file_list, inputA, inputB)
        print("removing duplicates...")
        remove_duplicates(tile_grammar, args.in_folder, inputA, inputB,
                          initial_smiles_strings)
        print("Iteration " + str(i) + " # of strings: " +
              str(len(smiles_strings)))

    # Validate against the grammar as it was written to disk.
    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.out_grammarpath)

    valid_strings = [
        w for w in smiles_strings
        if loaded_grammar.check_word(w) and len(str(w)) > 0
    ]

    print("# valid strings: " + str(len(valid_strings)))
    df = pandas.DataFrame({args.smiles_column: valid_strings})
    # Pass the key by keyword: recent pandas versions deprecate positional
    # arguments to to_hdf other than the path.
    df.to_hdf(args.out_filepath, key="table", format="table",
              data_columns=True)
示例#2
0
def remove_duplicates(tile_grammar, folder_name, inputA, inputB, word_list=None):
    """Delete invalid or duplicate ``.obj`` variations below ``folder_name``.

    Every ``*.obj`` file (other than ``inputA``, ``inputB`` and
    ``*_coll_graph.obj`` files) is converted to a string with
    ``obj_tools.obj2string``.  The file and its ``.mtl`` companion are removed
    when the string is longer than ``8 * MAX_WORD_LENGTH``, is rejected by
    ``tile_grammar.check_word``, or is ``similar_words`` to a string already
    seen in this folder or listed in ``word_list``.  Sub-folders are processed
    recursively, each starting again from ``word_list`` only.

    Args:
        tile_grammar: object providing ``check_word(word)`` and
            ``similar_words(a, b)``.
        folder_name: directory to scan.
        inputA: path of the first reference file; never removed.
        inputB: path of the second reference file; never removed.
        word_list: strings that count as already known.  It is not modified.
            ``None`` (the default) means no known strings.

    Raises:
        OSError: propagated from ``os.listdir`` / ``os.remove``, e.g. when an
            expected ``_coll_graph`` or ``.mtl`` companion file is missing.
    """
    # Work on a copy so neither the caller's list nor a shared default is
    # ever mutated.
    current_words = list(word_list) if word_list is not None else []

    for item_name in os.listdir(folder_name):
        subfolder_name = os.path.join(folder_name, item_name)
        if os.path.isdir(subfolder_name):
            remove_duplicates(tile_grammar, subfolder_name, inputA, inputB,
                              word_list)
            # A directory is never a shape file, even if its name ends in
            # ".obj"; do not fall through to the file handling below.
            continue

        # Plain "/" concatenation is kept on purpose: the result is compared
        # verbatim with inputA / inputB.
        file_path = folder_name + "/" + item_name
        if file_path == inputA or file_path == inputB:
            continue
        if item_name.endswith("_coll_graph.obj") or not item_name.endswith(".obj"):
            continue

        current_str = obj_tools.obj2string(file_path)
        base_path, _ = os.path.splitext(file_path)
        # Remove the "_coll_graph" companions right after the conversion
        # (presumably written by obj2string -- TODO confirm).
        os.remove(base_path + "_coll_graph.obj")
        os.remove(base_path + "_coll_graph.mtl")

        is_invalid = (len(current_str) > 8 * MAX_WORD_LENGTH
                      or not tile_grammar.check_word(current_str))
        is_duplicate = not is_invalid and any(
            tile_grammar.similar_words(known, current_str)
            for known in current_words)

        if is_invalid or is_duplicate:
            os.remove(file_path)
            os.remove(base_path + ".mtl")
            continue

        current_words.append(current_str)
示例#3
0
def main():
    """Decode a graph for ``args.in_word`` and try to embed it as a shape.

    The first and last file found in ``args.in_folder`` are the two reference
    inputs.  Their edge lists and edge categories are serialised as three
    text lines each (edge sources, edge targets, categories).  For up to
    ``args.num_attempts`` attempts a graph is decoded with the loaded
    ``Seq2SeqRNN`` model, appended to the two reference serialisations and
    handed to ``obj_tools.string2obj``; the loop stops at the first attempt
    that returns 0 or 1.  A ``[###   ]``-style progress bar is written to
    stdout while attempts fail.

    Raises:
        ValueError: if ``args.grammar`` or ``args.model`` is not a file.
    """
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)

    inputA, inputB = file_list[0], file_list[-1]

    # Result is not used further; the conversions are kept because they were
    # part of the original flow.
    initial_smiles_strings = [
        str(obj_tools.obj2string(inputA)),
        str(obj_tools.obj2string(inputB)),
    ]

    tile_grammar = TilingGrammar([])
    if not os.path.isfile(args.grammar):
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)
    tile_grammar.load(args.grammar)

    if TREE_GRAMMAR:
        tile_grammar.convert_to_tree_grammar()

    cluster_centers, node_types = shape_graph.categorize_edges(
        file_list[:100], tile_grammar)

    def serialize(edges, categories):
        """Return three lines: edge sources, edge targets, edge categories."""
        sources = "".join(str(edge[0]) + " " for edge in edges)
        targets = "".join(str(edge[1]) + " " for edge in edges)
        labels = "".join(str(categ) + " " for categ in categories)
        return sources + "\n" + targets + "\n" + labels + "\n"

    category_pairs = set()

    def register_pairs(edges, categories):
        """Record (category, category of the reversed edge) for every edge."""
        for edge, cat in zip(edges, categories):
            reverse_index = edges.index([edge[1], edge[0]])
            category_pairs.add((cat, categories[reverse_index]))

    all_edge_categories_a, all_edges_a = file_to_graph_with_categories(
        inputA, cluster_centers, tile_grammar)
    output_str_a = serialize(all_edges_a, all_edge_categories_a)
    register_pairs(all_edges_a, all_edge_categories_a)

    all_edge_categories_b, all_edges_b = file_to_graph_with_categories(
        inputB, cluster_centers, tile_grammar)
    output_str_b = serialize(all_edges_b, all_edge_categories_b)
    register_pairs(all_edges_b, all_edge_categories_b)

    (data_train, categories_train, masks_train, data_test, categories_test,
     masks_test, charset, charset_cats) = load_categories_dataset(args.data)

    num_encoder_tokens = len(charset)
    num_decoder_tokens = len(charset_cats)

    model = Seq2SeqRNN()
    if not os.path.isfile(args.model):
        raise ValueError("Model file %s doesn't exist" % args.model)
    model.load(charset, charset_cats, args.model, lstm_size=LSTM_SIZE)

    # Draw an empty progress bar, then move the cursor back to just after
    # the opening '['.
    sys.stdout.write("[%s]" % (" " * args.num_attempts))
    sys.stdout.flush()
    sys.stdout.write("\b" * (args.num_attempts + 1))

    for attempt in range(args.num_attempts):
        target_edge_categories, target_edges = decode_graph(
            model,
            tile_grammar,
            charset,
            args.in_word,
            max_length=data_train.shape[1],
            num_variants=32)

        # NOTE: a post-processing step that appears to have rewritten decoded
        # categories whose (category, reverse category) pair does not occur
        # in category_pairs used to live here; it is disabled, so
        # category_pairs is currently collected but not consulted.

        target_str = (output_str_a + output_str_b +
                      serialize(target_edges, target_edge_categories))

        filename = os.path.splitext(args.out)[0] + "_" + str(attempt)
        result = obj_tools.string2obj(inputA, inputB, target_str, filename)

        if result == 0:
            message = "Successfull attempt with target string: "
        elif result == 1:
            message = "Successfull embedding not strictly according to the target string: "
        else:
            message = None

        if message is not None:
            sys.stdout.write("\n")
            print(message)
            print(target_str)
            break

        # Failed attempt: advance the progress bar by one tick.
        sys.stdout.write("#")
        sys.stdout.flush()
示例#4
0
def main():
    """Print the strings and edge categories derived from the first input file.

    The first and last file found in ``args.in_folder`` seed a
    ``TilingGrammar``; edge clusters are computed from up to the first 100
    files.  For the first input file the function then prints, in order: its
    first string and that string's edge categories, the variants produced by
    ``smiles_variations`` with their node lists, the edge list of the string,
    and finally a three-line serialisation (edge sources, edge targets, edge
    categories) of all edges.
    """
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)

    inputA = file_list[0]
    inputB = file_list[-1]

    initial_smiles_strings = [
        str(obj_tools.obj2string(inputA)),
        str(obj_tools.obj2string(inputB)),
    ]
    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, node_types = shape_graph.categorize_edges(
        file_list[:100], tile_grammar, args.out_plot)

    # The text is split in the middle: the first half of the lines are the
    # strings, the second half the matching node-id lists.
    str_node_ids_list = str(obj_tools.obj2strings_ids(inputA)).split("\n")
    # Floor division: a float slice index raises TypeError on Python 3.
    half = len(str_node_ids_list) // 2
    smiles_strings = str_node_ids_list[:half]
    node_ids_list = str_node_ids_list[half:]

    node_ids = [[int(i) for i in node_list.split(" ")]
                for node_list in node_ids_list]

    graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(inputA))

    word = smiles_strings[0]
    word_node_ids = node_ids[0]

    edge_categories = shape_graph.smiles_to_edge_categories(
        word, word_node_ids, cluster_centers, graph_edges, tile_grammar)

    print("smiles string len: ", len(word))
    print(word)
    print("edge categories len: ", len(edge_categories))
    print(edge_categories)

    # One id past the real node ids marks characters that are not nodes.
    dummy_node_id = len(word_node_ids)

    # Align node ids with string positions: characters in the grammar's
    # charset consume the next real node id, all others get the dummy id.
    padded_node_ids = []
    num_nodes = 0
    for char in word:
        if char in tile_grammar.charset:
            padded_node_ids.append(word_node_ids[num_nodes])
            num_nodes += 1
        else:
            padded_node_ids.append(dummy_node_id)
    padded_node_ids.append(dummy_node_id)  # ensure at least one occurrence

    smiles_variants, node_lists = smiles_variations(word, padded_node_ids,
                                                    tile_grammar, 2)
    print("smiles variants:")
    print(smiles_variants)

    print("node lists:")
    print(node_lists)

    edge_list = tile_grammar.smiles_to_edges(word, padded_node_ids)
    print("edge list:")
    print(edge_list)

    all_edge_categories, all_edges = shape_graph.smiles_to_all_edge_categories(
        word, word_node_ids, cluster_centers, graph_edges, tile_grammar)

    if len(all_edge_categories) != len(all_edges):
        print("Error, mismatching number of edges", len(all_edges),
              "and edge categories", len(all_edge_categories))

    # Three lines: edge sources, edge targets, edge categories; every entry
    # is followed by a single space.
    output_str = (
        "".join(str(edge[0]) + " " for edge in all_edges) + "\n" +
        "".join(str(edge[1]) + " " for edge in all_edges) + "\n" +
        "".join(str(categ) + " " for categ in all_edge_categories) + "\n")

    print("graph embedding output string:")
    print(output_str)
def main():
    """Build the string / edge-category dataset and store it as an HDF table.

    A ``TilingGrammar`` is seeded from the first and last (sorted) file in
    ``args.in_folder``, extended with the category prefix sums computed from
    the edge clusters, and stored at ``args.out_grammarpath``.  For every
    input file, each string returned by ``obj_tools.obj2strings_ids`` is
    expanded with ``smiles_variations``; every new, non-empty variant of at
    most ``MAX_WORD_LENGTH`` characters is recorded together with its edge
    categories and the per-position category bounds.  The four resulting
    columns are written to ``args.out_filepath``.

    With ``args.remove_cycles`` set, the characters in ``tile_grammar.DIGITS``
    and ``tile_grammar.NUM_DELIMITER`` are stripped from the strings first and
    the grammar check on variants is skipped.
    """
    args = get_arguments()
    file_list = sorted(process_folder(args.in_folder))

    input_a = file_list[0]
    input_b = file_list[-1]

    initial_smiles_strings = [
        str(obj_tools.obj2string(input_a)),
        str(obj_tools.obj2string(input_b)),
    ]

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, _ = shape_graph.categorize_edges(file_list[:100],
                                                      tile_grammar, args.plot)

    # Running totals of the cluster counts: entry k is the number of
    # categories contributed by the first k cluster groups.
    num_categories = 0
    categories_prefix = [0]
    for clusters in cluster_centers:
        num_categories += clusters.shape[0]
        categories_prefix.append(num_categories)

    tile_grammar.set_categories_prefix(categories_prefix)
    tile_grammar.store(args.out_grammarpath)

    smiles_strings = []
    seen_words = set()  # O(1) membership companion of smiles_strings
    edge_categories = []
    edge_cat_min = []
    edge_cat_max = []

    # Build the cycle-stripping pattern once instead of once per string.
    cycle_pattern = None
    if args.remove_cycles:
        cycle_pattern = re.compile("[" + tile_grammar.DIGITS +
                                   tile_grammar.NUM_DELIMITER + "]")

    for file_name in file_list:
        str_node_ids = str(obj_tools.obj2strings_ids(file_name))
        if str_node_ids == '':
            continue
        # The text is split in the middle: the first half of the lines are
        # the strings, the second half the matching node-id lists.
        str_node_ids_list = str_node_ids.split("\n")
        # Floor division: a float slice index raises TypeError on Python 3.
        half = len(str_node_ids_list) // 2
        initial_strings = str_node_ids_list[:half]
        node_ids_list = str_node_ids_list[half:]

        if cycle_pattern is not None:
            current_strings = [
                cycle_pattern.sub("", elem) for elem in initial_strings
            ]
        else:
            current_strings = initial_strings

        node_ids = [[int(i) for i in node_list.split(" ")]
                    for node_list in node_ids_list]

        graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(file_name))

        for current_string in current_strings:
            # NOTE(review): node_ids[0] is used for every string of the file,
            # not the list at the string's own index -- confirm this is
            # intended.
            dummy_node_id = len(node_ids[0])

            # Align node ids with string positions: characters in the
            # grammar's charset consume the next real node id, all others
            # get the dummy id.
            padded_node_ids = []
            num_nodes = 0
            for char in current_string:
                if char in tile_grammar.charset:
                    padded_node_ids.append(node_ids[0][num_nodes])
                    num_nodes += 1
                else:
                    padded_node_ids.append(dummy_node_id)
            padded_node_ids.append(
                dummy_node_id)  # ensure at least one occurrence

            variant_strings, variant_nodes = smiles_variations(
                current_string, padded_node_ids, tile_grammar,
                args.num_variations)
            for word, padded_nodes in zip(variant_strings, variant_nodes):
                nodes = [x for x in padded_nodes if x != dummy_node_id]
                if not args.remove_cycles and not tile_grammar.check_word(
                        word):
                    continue
                if not 0 < len(str(word)) <= MAX_WORD_LENGTH:
                    continue
                if word in seen_words:
                    continue

                seen_words.add(word)
                smiles_strings.append(word)

                current_categories = shape_graph.smiles_to_edge_categories(
                    word, nodes, cluster_centers, graph_edges, tile_grammar)
                edge_categories.append(
                    " ".join([str(cat) for cat in current_categories]))

                if len(current_categories) > len(word):
                    print("wrong number of edge categories: ",
                          len(current_categories), " instead of ", len(word))
                    print(word)
                    print(current_categories)

                category_bounds = tile_grammar.smiles_to_categories_bounds(
                    word)
                min_parts = []
                max_parts = []
                for bounds in category_bounds:
                    min_parts.append(str(bounds[0]))
                    max_parts.append(str(bounds[1]))
                edge_cat_min.append(" ".join(min_parts))
                edge_cat_max.append(" ".join(max_parts))

    print("# items: " + str(len(smiles_strings)))

    df = pandas.DataFrame({
        args.smiles_column: smiles_strings,
        args.categories_column: edge_categories,
        MIN_BOUND_COL_NAME: edge_cat_min,
        MAX_BOUND_COL_NAME: edge_cat_max,
    })
    # Pass the key by keyword: recent pandas versions deprecate positional
    # arguments to to_hdf other than the path.
    df.to_hdf(args.out_filepath, key="table", format="table",
              data_columns=True)