Пример #1
0
def parse(tab_file, anno_id, genome_id, db_name, tab_type):
    """Collect the matching rows of a whitespace-delimited file and store them.

    Every line is split on whitespace. A row is kept when its second field
    equals ``tab_type[0]``; its fields are then read positionally as
    ``name, type, seq_id, start, stop``. The two coordinates are ordered so
    that the smaller one comes first. Rows whose start is not smaller than
    their stop are labelled with strand ``"-"``, all others with ``"+"``.
    Rows are keyed by name, so a later row with the same name replaces an
    earlier one.

    If at least one row was collected, the rows are passed to
    ``sqlite_methods.saveGFFTypeElementsInDb`` together with ``db_name``,
    ``genome_id``, ``anno_id`` and an empty string as the last argument.
    Nothing is returned.

    Lines with fewer than two fields (for example blank lines) are skipped.
    A matching row with missing or non-integer coordinates still raises
    ``IndexError`` or ``ValueError``.
    """
    entries = {}

    # The context manager closes the file even when a row fails to parse.
    with open(tab_file, "r") as handle:
        for line in handle:
            values = line.split()

            # Blank or truncated lines have no type column to compare against.
            if len(values) < 2:
                continue
            if tab_type[0] != str(values[1]):
                continue

            name = values[0]
            seq_id = values[2]
            start = int(values[3])
            stop = int(values[4])
            strand = "+"

            # NOTE(review): equal coordinates are also labelled "-";
            # confirm that this is intended for single-position rows.
            if start >= stop:
                start, stop = stop, start
                strand = "-"

            # Coordinates are stored as strings.
            start = str(start)
            stop = str(stop)

            entries[name] = {
                "seq_lower_coor": start,
                "seq_upper_coor": stop,
                "name": name,
                "seq_id": seq_id,
                "freq_coor": start + ";" + stop,
                "strand": strand,
            }

    if entries:
        sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id, anno_id, "")
Пример #2
0
def parse_gff(gff3_file, db_name, allowed_gff3_types, genome_id, anno_id, anno_id_overwrite):
    """Extract elements of the allowed GFF3 types from a file and store them.

    Steps, as performed below:

    1. ``getGFF3Types`` and ``getGFF3TypesHierarchy`` are called on the file
       to obtain a relation map between types; every value of that map is
       reduced to the list of its keys.
    2. For every ordered pair of allowed types, ``find_all_paths`` is asked
       for the paths between them. A path is used only if it contains every
       allowed type; its nodes are then recorded in ``hierarchy`` under
       their 1-based position in the path.
    3. The nodes on the last level are used as children and the nodes on the
       level before as parents for ``extractElements`` and
       ``extractFragments``.
    4. The result is passed to ``sqlite_methods.saveGFFTypeElementsInDb``.

    Any exception raised along the way is caught, reported on stdout as
    ``ERROR parse_gff <exc_info>`` and not re-raised; nothing is saved in
    that case. The function never returns a value.
    """
    try:
        gff3_types = getGFF3Types(gff3_file)
        gff3_relation_map = getGFF3TypesHierarchy(gff3_file, gff3_types)

        # Keep only the keys of each inner mapping. Both key collections are
        # materialised as lists so the result is a list on Python 2 and 3.
        for key in list(gff3_relation_map.keys()):
            gff3_relation_map[key] = list(gff3_relation_map[key].keys())

        hierarchy = {}

        for source_type in allowed_gff3_types:
            for target_type in allowed_gff3_types:
                paths = find_all_paths(gff3_relation_map, source_type, target_type)

                for path in paths:
                    # Only paths that contain every allowed type are used.
                    if not all(gff3_type in path for gff3_type in allowed_gff3_types):
                        continue

                    # Record each node under its 1-based depth in the path.
                    for level, node in enumerate(path, 1):
                        hierarchy.setdefault(level, {})[node] = 1

                    # A used path must be as long as the first path found
                    # for this pair; a failure ends up in the handler below.
                    assert len(paths[0]) == len(path)

        # NOTE(review): with an empty hierarchy both lookups yield None;
        # confirm that the helpers below cope with that.
        gff3_children = hierarchy.get(len(hierarchy))
        gff3_parents = hierarchy.get(len(hierarchy) - 1)

        index_list_parents_parents_allowed = extractElementsToRemove(gff3_file, hierarchy)
        index_list_parents = extractElements(gff3_file, gff3_parents, index_list_parents_parents_allowed)
        index_list_parents = extractFragments(gff3_file, gff3_children, index_list_parents)

        sqlite_methods.saveGFFTypeElementsInDb(index_list_parents, db_name, genome_id, anno_id, anno_id_overwrite)
    except Exception:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s %s" % ("ERROR", "parse_gff", sys.exc_info()))
Пример #3
0
def parse(tab_file, anno_id, genome_id, db_name, tab_type):
    """Read rows of one type from a whitespace-delimited file and save them.

    Each line is split on whitespace and its fields are read positionally:
    field 0 is the name, field 1 the type, field 2 the sequence id, and
    fields 3 and 4 the two integer coordinates. Only rows whose type equals
    ``tab_type[0]`` are kept.

    For a kept row the coordinates are stored lower-first as strings. The
    strand is ``"-"`` when the first coordinate is greater than or equal to
    the second, otherwise ``"+"``. Rows are stored under their name; a
    repeated name overwrites the previous row.

    When at least one row was kept, the collection is passed to
    ``sqlite_methods.saveGFFTypeElementsInDb`` with ``db_name``,
    ``genome_id``, ``anno_id`` and ``""``. The function returns nothing.

    Lines that have fewer than two fields, such as blank lines, are
    ignored. A kept row with missing or non-numeric coordinates raises
    ``IndexError`` or ``ValueError``.
    """
    entries = {}

    # Using ``with`` guarantees the file is closed, also on a parse error.
    with open(tab_file, "r") as tab_handle:
        for line in tab_handle:
            values = line.split()

            # Without a second field there is no type to compare.
            if len(values) < 2:
                continue

            if tab_type[0] == str(values[1]):
                lower = int(values[3])
                upper = int(values[4])
                strand = "+"

                # NOTE(review): rows with identical coordinates also get
                # strand "-"; verify that this is the desired labelling.
                if lower >= upper:
                    lower, upper = upper, lower
                    strand = "-"

                lower = str(lower)
                upper = str(upper)

                entry = {}
                entry["seq_lower_coor"] = lower
                entry["seq_upper_coor"] = upper
                entry["name"] = values[0]
                entry["seq_id"] = values[2]
                entry["freq_coor"] = lower + ";" + upper
                entry["strand"] = strand

                entries[entry["name"]] = entry

    if entries:
        sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id,
                                               anno_id, "")