Exemplo n.º 1
0
    def _get_chrbands(self, limit, taxon):
        """
        Build the chromosome-band partonomy for the given taxon.

        Reads the gzipped, tab-delimited chr band file configured in
        ``self.files[taxon]`` and adds the genome, its placed chromosomes,
        their bands and the parent bands of each band to ``self.graph``.
        The coordinate columns are read but intentionally ignored; only the
        subsequence hierarchy is built here.

        :param limit: when not None, processing stops after a placed
            chromosome line once more than ``limit`` data lines were read
        :param taxon: local part of the NCBITaxon identifier;
            must be a key of ``self.files``
        :return: None

        """
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        self.gu.addClassToGraph(self.graph, taxon_id, None)
        self.gu.addSynonym(self.graph, taxon_id, genome_label)

        self.gu.loadObjectProperties(self.graph, Feature.object_properties)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        self.gu.addOWLPropertyClassRestriction(
            self.graph, genome_id, Genotype.object_properties['in_taxon'],
            taxon_id)

        # NOTE
        # some less-finished genomes have placed and unplaced scaffolds
        # * Placed scaffolds:
        #    Scaffold has an oriented location within a chromosome.
        # * Unlocalized scaffolds:
        #     scaffold's chromosome is known,
        #     scaffold's position, orientation or both is not known.
        # * Unplaced scaffolds:
        #   it is not known which chromosome the scaffold belongs to.
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        # Only whole, placed chromosomes (chr13, chrX, ...) are kept;
        # the pattern is loop-invariant so it is compiled once.
        placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')

        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip comments; blank lines are skipped too because they
                # would otherwise fail the five-column unpack below
                if line == '' or line.startswith('#'):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, _start, _stop, band, rtype) = line.split('\t')
                line_counter += 1

                if placed_scaffold_re.match(chrom) is None:
                    # let's skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Skipping non-placed chromosome %s", chrom)
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, cclassid,
                    self.gu.object_properties['member_of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid+band
                maplocclass_label = makeChromLabel(chrom+band, genome_label)
                if band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    self.gu.addClassToGraph(
                        self.graph, maplocclass_id, maplocclass_label,
                        region_type_id)
                else:
                    # a line without a band describes the chromosome itself
                    region_type_id = Feature.types['chromosome']

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in (
                            Feature.types['chromosome_band'],
                            Feature.types['chromosome_subband']):
                        stain_type = Feature.types.get(rtype)
                        if stain_type is not None:
                            self.gu.addOWLPropertyClassRestriction(
                                self.graph, maplocclass_id,
                                Feature.properties['has_staining_intensity'],
                                stain_type)
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        logger.info("feature type %s != chr band",
                                    region_type_id)
                else:
                    logger.warning('staining type not found: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # alphabetical sort will put them in smallest to biggest
                parents.sort(reverse=True)

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                last_idx = len(parents) - 1
                for idx, parent in enumerate(parents):
                    pclassid = cclassid+parent  # class chr parts
                    pclass_label = makeChromLabel(chrom+parent, genome_label)
                    rti = getChrPartTypeByNotation(parent)

                    self.gu.addClassToGraph(
                        self.graph, pclassid, pclass_label, rti)

                    # each parent is a subsequence of the next entry in
                    # the list; the last one (p or q usually) is attached
                    # to the chromosome itself
                    if idx < last_idx:
                        container_id = cclassid+parents[idx+1]
                    else:
                        container_id = cclassid
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, pclassid,
                        Feature.object_properties['is_subsequence_of'],
                        container_id)
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, container_id,
                        Feature.object_properties['has_subsequence'],
                        pclassid)

                # connect the band here to the first one in the parent list
                if parents:
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, maplocclass_id,
                        Feature.object_properties['is_subsequence_of'],
                        cclassid+parents[0])
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, cclassid+parents[0],
                        Feature.object_properties['has_subsequence'],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break

        self.gu.loadAllProperties(self.graph)

        # TODO figure out the staining intensities for the encompassing bands

        return
Exemplo n.º 2
0
    def _get_chrbands(self, limit, taxon, genome_id=None):
        """
        Build the chromosome-band partonomy for the given taxon.

        Reads the gzipped, tab-delimited chr band file configured in
        ``self.files[taxon]`` and adds the genome, its placed chromosomes,
        their bands and the parent bands of each band to ``self.graph``.
        The coordinate columns are ignored; only the subsequence
        hierarchy is built here.

        :param limit: when not None, processing stops after a placed
            chromosome line once more than ``limit`` lines (comments
            included) were read
        :param taxon: local part of the NCBITaxon identifier;
            must be a key of ``self.files``
        :param genome_id: identifier to use for the genome; when None,
            one is made from the taxon with ``geno.makeGenomeID``
        :return: None

        """
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        if genome_id is None:
            genome_id = geno.makeGenomeID(
                taxon_id)  # makes a blank node always
        geno.addGenome(taxon_id, genome_label, genome_id)
        model.addOWLPropertyClassRestriction(genome_id,
                                             self.globaltt['in taxon'],
                                             taxon_id)

        # NOTE
        # some less-finished genomes have placed and unplaced scaffolds
        # * Placed scaffolds:
        #    Scaffold has an oriented location within a chromosome.
        # * Unlocalized scaffolds:
        #     scaffold's chromosome is known,
        #     scaffold's position, orientation or both is not known.
        # * Unplaced scaffolds:
        #   it is not known which chromosome the scaffold belongs to.
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        # Only whole, placed chromosomes (chr13, chrX, ...) are kept;
        # the pattern is loop-invariant so it is compiled once.
        placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')

        # column positions are fixed, so look them up once
        col = ['chrom', 'start', 'stop', 'band', 'rtype']
        chrom_idx = col.index('chrom')
        band_idx = col.index('band')
        rtype_idx = col.index('rtype')

        with gzip.open(myfile, 'rb') as reader:
            for line in reader:
                line_counter += 1
                line = line.decode().strip()
                # skip comments; blank lines are skipped too because
                # indexing their first character would raise IndexError
                if line == '' or line[0] == '#':
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                chrom = row[chrom_idx]
                band = row[band_idx]
                rtype = row[rtype_idx]

                if placed_scaffold_re.match(chrom) is None:
                    # let's skip over anything that isn't a placed_scaffold
                    # LOG.info("Skipping non-placed chromosome %s", chrom)  # chatty
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, self.globaltt['member of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid + band
                maplocclass_label = makeChromLabel(chrom + band, genome_label)
                if band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(maplocclass_id, maplocclass_label,
                                          region_type_id)
                else:
                    # a line without a band describes the chromosome itself
                    region_type_id = self.globaltt['chromosome']

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in (
                            self.globaltt['chromosome_band'],
                            self.globaltt['chromosome_subband']):
                        stain_type = self.resolve(rtype)
                        if stain_type is not None:
                            model.addOWLPropertyClassRestriction(
                                maplocclass_id,
                                self.globaltt['has_sequence_attribute'],
                                stain_type)
                    else:
                        # usually happens if it's a chromosome (SO:000340) because
                        # they don't actually have banding info
                        LOG.info("feature type '%s' is not chr band",
                                 self.globaltcid[region_type_id])
                else:
                    LOG.info('staining type not found for: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # alphabetical sort will put them in smallest to biggest
                parents.sort(reverse=True)

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                last_idx = len(parents) - 1
                for idx, prnt in enumerate(parents):
                    parent = prnt.strip()
                    if parent == "":
                        continue
                    pclassid = cclassid + parent  # class chr parts
                    pclass_label = makeChromLabel(chrom + parent, genome_label)
                    rti = getChrPartTypeByNotation(parent, self.graph)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # each parent is a subsequence of the next entry in
                    # the list; the last one (p or q usually) is attached
                    # to the chromosome itself.  The position comes from
                    # enumerate rather than a parents.index() rescan.
                    if idx < last_idx:
                        container_id = cclassid + parents[idx + 1]
                    else:
                        container_id = cclassid
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        container_id)
                    model.addOWLPropertyClassRestriction(
                        container_id, self.globaltt['has subsequence'],
                        pclassid)

                # connect the band here to the first one in the parent list
                if parents:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id, self.globaltt['is subsequence of'],
                        cclassid + parents[0])
                    model.addOWLPropertyClassRestriction(
                        cclassid + parents[0],
                        self.globaltt['has subsequence'], maplocclass_id)

                if limit is not None and line_counter > limit:
                    break
Exemplo n.º 3
0
    def _process_QTLs_genomic_location(self, raw, taxon_id, build_id, build_label, limit=None):
        """
        Add QTLs, their trait associations and their genomic locations
        from a gzipped, tab-delimited, nine-column file.

        For each usable row this adds to the graph:
          * the QTL (AQTL:<QTL_ID>) as an individual, with its taxon;
          * a reference for the PUBMED_ID attribute, when present;
          * a QTL-to-trait association (is_marker_for) with evidence
            ECO:0000061, the publication as its source and, when a
            P-value attribute is present, that value as its score;
          * the QTL as a feature with fuzzy start/end positions on the
            chromosome of the given build.

        Rows starting with '#', blank rows, and rows whose attribute column
        contains an item without '=' are skipped (the latter are logged).

        :param raw: path of the gzipped input file (read as ISO-8859-1)
        :param taxon_id: taxon identifier of the organism the QTLs belong to
        :param build_id: identifier of the genome build the coordinates use
        :param build_label: label of that genome build
        :param limit: outside test mode, stop once more than this many
            rows were read; None means no limit
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        geno = Genotype(g)
        # assume that chrs get added to the genome elsewhere
        # NOTE(review): genome_id is not used below; the call is kept in
        # case makeGenomeID has side effects - confirm and remove
        genome_id = geno.makeGenomeID(taxon_id)

        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                line_counter += 1
                # csv.reader yields [] for a blank line; skip those and
                # comment rows before unpacking the nine columns
                if not row or row[0].startswith('#'):
                    continue

                (chromosome, _qtl_source, _qtl_type, start_bp, stop_bp,
                 _frame, strand, _score, attr) = row

                # Chr.Z   Animal QTLdb    Production_QTL  33954873        34023581        .       .       .
                # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
                # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
                # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
                # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
                # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01"

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,
                # FlankMarkers,VTO_name,Map_Type,Significance,P-value,Model,Test_Base,Variance,
                # Bayes-value,PTO_name,gene_IDsrc,peak_cM,CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,Likelihood_Ratio,LS-means

                # deal with poorly formed attributes:
                # drop a value-less "FlankMarkers"; item, then strip quotes
                attr = attr.replace('"FlankMarkers";', '')
                attr_items = attr.replace('"', '').split(";")
                if not all('=' in item for item in attr_items):
                    logger.error("Poorly formed data on line %d:\n %s", line_counter, '\t'.join(row))
                    continue
                # split on the first '=' only, so a value that itself
                # contains '=' does not break the key/value pairing
                attribute_dict = dict(item.split("=", 1) for item in attr_items)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.testMode and int(qtl_num) not in self.test_ids:
                    continue

                # make association between QTL and trait
                qtl_id = 'AQTL:' + str(qtl_num)
                gu.addIndividualToGraph(g, qtl_id, None, geno.genoparts['QTL'])
                geno.addTaxon(taxon_id, qtl_id)

                trait_id = 'AQTLTrait:'+attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = attribute_dict.get('PUBMED_ID')
                if pub_id is not None:
                    if pub_id.startswith('ISU'):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        p = Reference(pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        p = Reference(pub_id, Reference.ref_types['journal_article'])
                    p.addRefToGraph(g)

                # Add QTL to graph
                assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict:
                    # values look like "<0.05"; only the number is kept
                    p_value = float(attribute_dict['P-value'].replace('<', ''))
                    assoc.set_score(p_value)

                assoc.add_association_to_graph(g)
                # TODO make association to breed (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = chromosome.replace('Chr.', '')
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(qtl_id, None, geno.genoparts['QTL'])
                # an empty coordinate is passed on as None
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(start_bp, chrom_in_build_id, strand,
                                                    [Feature.types['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(stop_bp, chrom_in_build_id, strand,
                                                  [Feature.types['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(g, taxon_id)
                qtl_feature.addFeatureToGraph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genomic mappings for %s", taxon_id)
        return
Exemplo n.º 4
0
    def _get_chrbands(self, limit, taxon):
        """
        Build the chromosome-band partonomy for the given taxon.

        Reads the gzipped, tab-delimited chr band file configured in
        ``self.files[taxon]`` and adds the genome, its placed chromosomes,
        their bands and the parent bands of each band to ``self.graph``.
        The coordinate columns are read but intentionally ignored; only the
        subsequence hierarchy is built here.

        :param limit: when not None, processing stops after a placed
            chromosome line once more than ``limit`` data lines were read
        :param taxon: local part of the NCBITaxon identifier;
            must be a key of ``self.files``
        :return: None

        """
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        model.addOWLPropertyClassRestriction(
            genome_id, self.globaltt['in taxon'],
            taxon_id)

        # NOTE
        # some less-finished genomes have placed and unplaced scaffolds
        # * Placed scaffolds:
        #    Scaffold has an oriented location within a chromosome.
        # * Unlocalized scaffolds:
        #     scaffold's chromosome is known,
        #     scaffold's position, orientation or both is not known.
        # * Unplaced scaffolds:
        #   it is not known which chromosome the scaffold belongs to.
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        # Only whole, placed chromosomes (chr13, chrX, ...) are kept;
        # the pattern is loop-invariant so it is compiled once.
        placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')

        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip comments; blank lines are skipped too because they
                # would otherwise fail the five-column unpack below
                if line == '' or line.startswith('#'):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, _start, _stop, band, rtype) = line.split('\t')
                line_counter += 1

                if placed_scaffold_re.match(chrom) is None:
                    # let's skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Skipping non-placed chromosome %s", chrom)
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, self.globaltt['member of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid+band
                maplocclass_label = makeChromLabel(chrom+band, genome_label)
                if band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(
                        maplocclass_id, maplocclass_label,
                        region_type_id)
                else:
                    # a line without a band describes the chromosome itself
                    region_type_id = self.globaltt['chromosome']

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in (
                            self.globaltt['chromosome_band'],
                            self.globaltt['chromosome_subband']):
                        stain_type = self.resolve(rtype)
                        if stain_type is not None:
                            model.addOWLPropertyClassRestriction(
                                maplocclass_id,
                                self.globaltt['has_sequence_attribute'],
                                stain_type)
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        logger.info("feature type %s != chr band",
                                    region_type_id)
                else:
                    logger.warning('staining type not found: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # alphabetical sort will put them in smallest to biggest
                parents.sort(reverse=True)

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                last_idx = len(parents) - 1
                for idx, prnt in enumerate(parents):
                    parent = prnt.strip()
                    # An empty parent has no class id of its own.  Skip it
                    # entirely so the restrictions below are never written
                    # with a pclassid left over from an earlier iteration.
                    if parent == "":
                        continue
                    pclassid = cclassid + parent  # class chr parts
                    pclass_label = makeChromLabel(chrom + parent, genome_label)
                    rti = getChrPartTypeByNotation(parent, self.graph)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # each parent is a subsequence of the next entry in
                    # the list; the last one (p or q usually) is attached
                    # to the chromosome itself
                    if idx < last_idx:
                        container_id = cclassid+parents[idx+1]
                    else:
                        container_id = cclassid
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        container_id)
                    model.addOWLPropertyClassRestriction(
                        container_id, self.globaltt['has subsequence'],
                        pclassid)

                # connect the band here to the first one in the parent list
                if parents:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id, self.globaltt['is subsequence of'],
                        cclassid+parents[0])
                    model.addOWLPropertyClassRestriction(
                        cclassid+parents[0],  self.globaltt['has subsequence'],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break

        # TODO figure out the staining intensities for the encompassing bands

        return
Exemplo n.º 5
0
    def _get_chrbands(self, limit, taxon):
        """
        For the given taxon, it will fetch the chr band file.
        We will not deal with the coordinate information with this parser.
        Here, we only are concerned with building the partonomy.
        :param limit:
        :return:

        """
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        model.addOWLPropertyClassRestriction(
            genome_id, self.globaltt['in taxon'], taxon_id)

        placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
        # currently unused patterns
        # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
        # unplaced_scaffold_pattern = r'chrUn_(\w+)'

        col = ['chrom', 'start', 'stop', 'band', 'rtype']
        with gzip.open(myfile, 'rb') as reader:
            for line in reader:
                line_counter += 1
                # skip comments
                line = line.decode().strip()
                if line[0] == '#':
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                chrom = row[col.index('chrom')]
                band = row[col.index('band')]
                rtype = row[col.index('rtype')]
                # NOTE
                # some less-finished genomes have placed and unplaced scaffolds
                # * Placed scaffolds:
                #    Scaffold has an oriented location within a chromosome.
                # * Unlocalized scaffolds:
                #     scaffold 's chromosome  is known,
                #     scaffold's position, orientation or both is not known.
                # *Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to.
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1

                mch = re.match(placed_scaffold_pattern+r'$', chrom)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first captured group of the match
                    # chrom = mch.group(1)  # TODO unused
                    pass
                else:
                    # let's skip over anything that isn't a placed_scaffold
                    LOG.info("Skipping non-placed chromosome %s", chrom)
                    continue
                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, self.globaltt['member of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid+band
                maplocclass_label = makeChromLabel(chrom+band, genome_label)
                if band is not None and band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(
                        maplocclass_id, maplocclass_label,
                        region_type_id)
                else:
                    region_type_id = self.globaltt['chromosome']
                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in [
                            self.globaltt['chromosome_band'],
                            self.globaltt['chromosome_subband']]:
                        stain_type = self.resolve(rtype)
                        if stain_type is not None:
                            model.addOWLPropertyClassRestriction(
                                maplocclass_id,
                                self.globaltt['has_sequence_attribute'],
                                self.resolve(rtype))
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        LOG.info("feature type %s != chr band", region_type_id)
                else:
                    LOG.warning('staining type not found: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # a reverse alphabetical sort orders them from smallest to biggest
                parents.sort(reverse=True)

                # print("PARENTS of", maplocclass_id, "=", parents)
                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                for prnt in parents:
                    parent = prnt.strip()
                    if parent is None or parent == "":
                        continue
                    pclassid = cclassid + parent  # class chr parts
                    pclass_label = makeChromLabel(chrom + parent, genome_label)
                    rti = getChrPartTypeByNotation(parent, self.graph)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # for canonical chromosomes,
                    # the subbands are subsequences of the full band;
                    # add the subsequence relationships as restrictions

                    if prnt != parents[-1]:
                        grandparent = 1 + parents.index(prnt)
                        pid = cclassid + parents[grandparent]   # the instance
                        model.addOWLPropertyClassRestriction(
                            pclassid, self.globaltt['is subsequence of'], pid)
                        model.addOWLPropertyClassRestriction(
                            pid, self.globaltt['has subsequence'], pclassid)
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        model.addOWLPropertyClassRestriction(
                            pclassid, self.globaltt['is subsequence of'], cclassid)
                        model.addOWLPropertyClassRestriction(
                            cclassid, self.globaltt['has subsequence'], pclassid)

                # connect the band here to the first one in the parent list
                if len(parents) > 0:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id, self.globaltt['is subsequence of'],
                        cclassid + parents[0])
                    model.addOWLPropertyClassRestriction(
                        cclassid + parents[0], self.globaltt['has subsequence'],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break

        # TODO figure out the staining intensities for the encompassing bands

        return