Example No. 1
def main(argv):
    bedfile = argv[1]
    wigfile = argv[2]
    intersecter = IntervalTree()
    for peak in parse_wig(wigfile):
        intersecter.insert_interval(peak)

    report(intersecter, bedfile)
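The helpers parse_wig and report are not shown in this example. Below is a minimal sketch of what they might look like, assuming the bx-python bx.intervals.intersection classes these examples appear to use and a simple tab-separated peak/BED layout; the names and file formats are illustrative, not taken from the original source.

# Hypothetical helpers for the example above; file layouts are assumed.
from bx.intervals.intersection import Interval

def parse_wig(wigfile):
    # Yield one Interval per peak line: "chrom<TAB>start<TAB>end<TAB>score".
    with open(wigfile) as handle:
        for line in handle:
            if not line.strip() or line.startswith(("track", "#")):
                continue
            chrom, start, end, score = line.split("\t")[:4]
            yield Interval(int(start), int(end), value=float(score), chrom=chrom)

def report(intersecter, bedfile):
    # Print every peak overlapping each BED record.
    with open(bedfile) as handle:
        for line in handle:
            chrom, start, end = line.split("\t")[:3]
            for peak in intersecter.find(int(start), int(end)):
                print(chrom, start, end, peak.start, peak.end, peak.value)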
Example No. 2
    def setUp(self):

        iv = IntervalTree()
        n = 0
        for i in range(1, 1000, 80):
            iv.insert(i, i + 10, dict(value=i * i))
            # add is synonym for insert.
            iv.add(i + 20, i + 30, dict(astr=str(i * i)))

            # or insert/add an interval object with start, end attrs.
            iv.insert_interval(
                Interval(i + 40, i + 50, value=dict(astr=str(i * i))))
            iv.add_interval(
                Interval(i + 60, i + 70, value=dict(astr=str(i * i))))

            n += 4
        self.intervals = self.iv = iv
        self.nintervals = n
Example No. 3
    def setUp(self):

        iv = IntervalTree()
        n = 0
        for i in range(1, 1000, 80):
            iv.insert(i, i + 10, dict(value=i*i))
            # add is synonym for insert.
            iv.add(i + 20, i + 30, dict(astr=str(i*i)))

            # or insert/add an interval object with start, end attrs.
            iv.insert_interval(Interval(i + 40, i + 50,
                value=dict(astr=str(i*i))))
            iv.add_interval(Interval(i + 60, i + 70,
                value=dict(astr=str(i*i))))

            n += 4 
        self.intervals = self.iv = iv
        self.nintervals = n
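The fixture above only builds the tree; no query test is shown. A short sketch of one follows, assuming the find(start, end) call used elsewhere on this page returns a list of the stored items overlapping the range; the method name and asserted counts are illustrative, not from the original suite.

    def test_find_overlapping(self):
        # Hypothetical test method, not part of the original suite.
        # For i == 1, setUp inserted items covering 1-11, 21-31, 41-51 and 61-71,
        # so a query spanning all four should return four results.
        self.assertEqual(len(self.iv.find(0, 75)), 4)

        # A query past the last inserted interval returns an empty list.
        self.assertEqual(self.iv.find(2000, 2100), [])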
Example No. 4
def parse_blast(blast_str, qfeat):
  """Parses tabular BLAST output and returns the CNS hit intervals that
  intersect the query CNS feature."""
  scns_interval = IntervalTree()
  for line in blast_str.split("\n"):
    if not line.strip(): continue
    if "WARNING" in line: continue
    if "ERROR" in line: continue
    line = line.split("\t")
    # list() so the integer coordinates can be extended with the float fields.
    locus = list(map(int, line[6:10]))
    locus.extend(map(float, line[10:]))

    s_start, s_end = locus[:2]
    # Normalize so start <= end regardless of the hit's orientation.
    s_start, s_end = min(s_start, s_end), max(s_start, s_end)
    scns_interval.insert_interval(Interval(s_start, s_end))

  q_start = min(int(qfeat['start']), int(qfeat['end']))
  q_end = max(int(qfeat['start']), int(qfeat['end']))
  intersecting_cns = scns_interval.find(q_start, q_end)
  return intersecting_cns
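For context, a sketch of calling parse_blast follows; the single tab-separated BLAST line and the query coordinates are invented, and only the 'start'/'end' keys of qfeat are actually used by the function.

# Illustrative call only; every value below is made up.
qfeat = {'start': 150, 'end': 480}
blast_line = "\t".join([
    "query1", "subject1", "98.5", "120", "1", "0",   # ids and alignment stats
    "100", "220", "5300", "5420",                    # alignment coordinates
    "1e-50", "222",                                  # evalue, bitscore
])
for hit in parse_blast(blast_line, qfeat):
    print(hit.start, hit.end)   # intervals overlapping 150-480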
Example No. 5
def _process_piece(filename_vci, contig, reverse):
    """
    Parse one contig of a VCI file into an IntervalTree of conversion intervals.

    :param filename_vci: path to the tabix-indexed VCI file
    :param contig: name of the contig to process
    :param reverse: True to convert in the reverse direction (swaps the
                    deleted/inserted columns)
    :return: dict containing the contig name; the finished tree is stored in
             global_mapping_tree
    """
    _chrom = 0
    _pos = 1
    _shared = 2
    _deleted = 3 if not reverse else 4
    _inserted = 4 if not reverse else 3
    _fragment = 5

    ret = {'contig': contig}

    LOG.info("Chromosome: {}".format(contig))
    line_no = 0
    global global_mapping_tree

    tree = IntervalTree()

    try:
        pos_from = 0
        pos_to = 0

        tabix_file = pysam.TabixFile(filename_vci)

        iterator = None

        try:
            iterator = tabix_file.fetch(contig, parser=pysam.asTuple())
        except Exception:
            LOG.debug("Exception for {}".format(contig))

        if iterator is None:
            LOG.debug("No iterator")
            return ret

        for rec in iterator:
            if len(rec) != 6:
                raise exceptions.G2GError("Unexpected line in G2G file. Line #{0:,}: {1}".format(line_no, rec))

            if rec[2] == '.':
                continue

            # Example VCI records:
            # 1 3000019 G . A 3000019
            # 1 3003170 A . T 3151
            # 1 3003197 A . G 27
            # 1 3003640 CG  GGGG  . 444
            # 1 3006790 G AA  . 3145
            # 1 3006834 G A . 42
            # 1 3007272 GC  C . 438
            # 1 3008489 T . ATC 1215

            #LOG.debug(rec)

            fragment = int(rec[_fragment])
            deleted_bases = 0 if rec[_deleted] == '.' else len(rec[_deleted])
            inserted_bases = 0 if rec[_inserted] == '.' else len(rec[_inserted])

            #LOG.debug("pos_from={}, pos_to={}".format(pos_from, pos_to))
            #LOG.debug("Inserting interval {} - {}".format(pos_from, pos_from + fragment))
            interval = Interval(pos_from, pos_from + fragment,
                                IntervalInfo(contig, pos_to, pos_to + fragment, rec[_shared],
                                             rec[_deleted], rec[_inserted], rec[_pos]))

            #LOG.debug(interval)

            tree.insert_interval(interval)

            pos_from += (fragment + deleted_bases)
            pos_to += (fragment + inserted_bases)

            line_no += 1

            # Periodic progress output for large contigs.
            if line_no % 10000 == 0:
                print("CONTIG {} {}".format(contig, line_no))
    except KeyboardInterrupt:
        raise exceptions.KeyboardInterruptError()
    except Exception:
        g2g_utils._show_error()
        LOG.error("Error processing contig {}".format(contig))

    # Store the finished tree in the module-level mapping, keyed by contig.
    global_mapping_tree[contig] = tree

    LOG.debug("PROCESSED {0:,} lines for {1}".format(line_no, contig))

    return ret
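The function above stores its result in a module-level global_mapping_tree rather than returning the tree. A minimal driver sketch follows; the dictionary initialization, the contig names, and the VCI path are assumptions, not part of the original module.

# Hypothetical driver; global_mapping_tree, the contig names, and the file
# path are illustrative assumptions.
global_mapping_tree = {}

def build_mapping_trees(filename_vci, contigs, reverse=False):
    # Populate global_mapping_tree with one IntervalTree per contig.
    for contig in contigs:
        _process_piece(filename_vci, contig, reverse)
    return global_mapping_tree

# trees = build_mapping_trees("reference_to_strain.vci.gz", ["1", "2", "X"])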
Example No. 6
def annotate_igrs(genome, igr_df):
    """
    Annotate the intergenic regions listed in a dataframe with any available annotations from Rfam

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.DataFrame
        The dataframe with the columns 'accession', 'start', 'end', 'length', 'gc'

    Returns
    -------
    annotated_igr_df: pandas.DataFrame
    """

    # Initialize connection to Rfam database
    session = rfam_session()

    # Get the list of "rfamseq_acc" numbers for a given organism
    rfamseq_acc_list = session.query(t_genseq.c.rfamseq_acc).filter(
        t_genseq.c.upid == genome.upid).distinct().all()

    # Create a list to store all the interval trees
    annotation_tree_dict = {}

    for rfamseq_acc in rfamseq_acc_list:

        # Pull rfamseq_acc out of the list
        rfamseq_acc = rfamseq_acc[0]

        rna_query = session.query(t_full_region).filter(
            t_full_region.c.rfamseq_acc == rfamseq_acc)
        rna_list = rna_query.all()

        # Make an interval tree for all of the RNA annotations to allow for rapid overlap search
        annotation_tree = IntervalTree()

        # Go through and add each RNA annotation to the interval tree
        for rna in rna_list:
            start = min(rna.seq_start, rna.seq_end)
            end = max(rna.seq_start, rna.seq_end)

            annotation_interval = Interval(start=start,
                                           end=end,
                                           chrom=rna.rfamseq_acc,
                                           value=rna)
            annotation_tree.insert_interval(annotation_interval)

        rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
        annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

    # Make an empty list of all the igrs with annotations
    annotated_igr_list = []
    for accession, accession_igr_df in igr_df.groupby('accession'):
        # Lookup the RNA annotation tree for the given accession
        try:
            annotation_tree = annotation_tree_dict[accession]
        except KeyError:
            print("IGR dataframe key: {} not found. Available keys are: {}".
                  format(accession, annotation_tree_dict.keys()))
            continue

        # For each IGR find all of the overlaps with annotated RNAs
        for igr in accession_igr_df.itertuples():

            overlap_list = annotation_tree.find(igr.start, igr.end)
            for overlap in overlap_list:
                # Add the IGR to the annotated_igr_list
                annotated_igr_list.append({
                    'igr_index': igr[0],
                    'rfam_acc': overlap.value.rfam_acc
                })

    # Convert annotated_igr_list into dataframe and merge on the rfam_acc
    annotated_igr_df = pd.merge(igr_df,
                                pd.DataFrame(annotated_igr_list,
                                             columns=["igr_index",
                                                      "rfam_acc"]),
                                on="igr_index",
                                how='left')

    # Look up the information for all of the RNA families represented in this genome
    rna_family_query = session.query(Family)\
                              .with_entities(Family.rfam_acc, Family.rfam_id, Family.description, Family.type)\
                              .filter(Family.rfam_acc.in_(annotated_igr_df["rfam_acc"].dropna().unique()))
    rna_families_df = pd.read_sql(rna_family_query.statement,
                                  rna_family_query.session.bind)

    merged_igr_df = pd.merge(annotated_igr_df,
                             rna_families_df,
                             on="rfam_acc",
                             how="left")

    combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
                                                  .agg(dict(rfam_acc=lambda x: ','.join(set(x)),
                                                            rfam_id=lambda x: ','.join(set(x)),
                                                            type=lambda x: ','.join(set(x)),
                                                            description=lambda x: '<br>'.join(set(x))))
    merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
    merged_igr_df.reset_index(inplace=True, drop=True)
    merged_igr_df.update(combined_descriptions)

    merged_igr_df["category"] = merged_igr_df.apply(
        lambda row: categorize_igr(row), axis=1)

    merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
    session.close()
    return merged_igr_df
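A sketch of the input this function expects; the accession, coordinates, lengths, and GC values are invented. Note that besides the columns named in the docstring, the merge on 'igr_index' above implies the dataframe also carries an igr_index column matching its index.

# Illustrative input only; all values are made up.
import pandas as pd

igr_df = pd.DataFrame([
    {'igr_index': 0, 'accession': 'CP000000', 'start': 1200, 'end': 1650,
     'length': 450, 'gc': 0.41},
    {'igr_index': 1, 'accession': 'CP000000', 'start': 9800, 'end': 9950,
     'length': 150, 'gc': 0.55},
])

# genome would be a src.data.rfam_db.Genome instance for the same organism:
# annotated = annotate_igrs(genome, igr_df)
# annotated[['accession', 'start', 'end', 'rfam_acc', 'type', 'category']]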
Example No. 7
def process_piece(args):
    filename = args['filename']
    contig = args['contig']
    reverse = args['reverse']

    # Create the tree before the try block so it is always defined for the
    # return statement below, even if parsing fails.
    tree = IntervalTree()

    try:
        _chrom = 0
        _pos = 1
        _shared = 2
        _deleted = 3 if not reverse else 4
        _inserted = 4 if not reverse else 3
        _fragment = 5

        num_lines_chrom = 0
        num_lines_processed = 0

        pos_from = 0
        pos_to = 0

        tabix_file = pysam.TabixFile(filename)
        iterator = tabix_file.fetch(contig, parser=pysam.asTuple())

        LOG.info("Parsing VCI, contig: {}".format(contig))

        for rec in iterator:
            num_lines_chrom += 1

            if len(rec) != 6:
                raise exceptions.G2GError(
                    "Unexpected line in VCI file: {0}".format(rec))

            if rec[2] == '.':
                continue

            # Example VCI records:
            # 1 3000019 G . A 3000019
            # 1 3003170 A . T 3151
            # 1 3003197 A . G 27
            # 1 3003640 CG  GGGG  . 444
            # 1 3006790 G AA  . 3145
            # 1 3006834 G A . 42
            # 1 3007272 GC  C . 438
            # 1 3008489 T . ATC 1215

            #LOG.debug("||".join(rec))

            fragment = int(rec[_fragment])
            deleted_bases = 0 if rec[_deleted] == '.' else len(rec[_deleted])
            inserted_bases = 0 if rec[_inserted] == '.' else len(
                rec[_inserted])

            #LOG.debug("pos_from={}, pos_to={}".format(pos_from, pos_to))
            #LOG.debug("Inserting interval {} - {}".format(pos_from, pos_from + fragment))
            interval = Interval(
                pos_from, pos_from + fragment,
                IntervalInfo(contig, pos_to, pos_to + fragment, rec[_shared],
                             rec[_deleted], rec[_inserted], rec[_pos]))

            #LOG.debug(interval)

            tree.insert_interval(interval)

            pos_from += (fragment + deleted_bases)
            pos_to += (fragment + inserted_bases)
            num_lines_processed += 1

        #LOG.debug("Parsed {0:,} lines for contig {1} in {2}".format(num_lines_processed, contig, g2g_utils.format_time(cotig_start_time, time.time())))
    except Exception as e:
        LOG.error("Error processing contig {}: {}".format(contig, e))

    return {'tree': tree, 'contig': contig}
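Since process_piece takes a single argument dict, it appears designed for a worker pool. Below is a sketch of collecting one tree per contig with multiprocessing.Pool; the contig names, pool size, and the assumption that the trees can be pickled back from the workers are not taken from the original module.

# Hypothetical driver; contig names and pool size are illustrative, and the
# returned IntervalTree objects are assumed to be picklable across processes.
import multiprocessing

def build_trees(filename, contigs, reverse=False, processes=4):
    args = [{'filename': filename, 'contig': c, 'reverse': reverse}
            for c in contigs]
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.map(process_piece, args)
    # Index the per-contig trees by contig name.
    return {r['contig']: r['tree'] for r in results}

# trees = build_trees("reference_to_strain.vci.gz", ["1", "2", "X"])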