def get_sequences():
    """Load the WS235 genome once and derive reverse complements and lengths.

    Record names are truncated at the first space to form the dictionary keys.

    Returns:
        tuple: ``(sequences, rc_sequences, chr_lens)``, three dicts sharing the
        same keys and holding, respectively, each record's sequence, the
        result of ``rc()`` on that sequence, and ``len()`` of that sequence.
    """
    fasta_filename = '/scratch/indexes/WS235.fa'
    sequences = dict((p.name.split(' ')[0], p.seq)
                     for p in HTSeq.FastaReader(fasta_filename))
    # Derive both secondary tables from the parsed records instead of reading
    # the whole genome file a second time.
    rc_sequences = dict((name, rc(seq)) for name, seq in sequences.items())
    chr_lens = dict((name, len(seq)) for name, seq in sequences.items())
    return (sequences, rc_sequences, chr_lens)
示例#2
0
def test_fasta_parser():
    """Smoke-test HTSeq.FastaReader by draining the example file twice.

    The first pass uses the default iterator, the second passes
    ``raw_iterator=True``. Progress messages are printed around each pass.
    """
    example_path = 'example_data/fastaExLong.fa'

    print("Test Fasta parser")
    default_reader = HTSeq.FastaReader(example_path)
    for _record in default_reader:
        continue
    print("Test passed")

    print("Test Fasta parser (raw iterator)")
    raw_reader = HTSeq.FastaReader(example_path, raw_iterator=True)
    for _record in raw_reader:
        continue
    print("Test passed")
示例#3
0
def gatherAllQueries(queryPath):
    """Load every allele from the FASTA files directly inside *queryPath*.

    Files are numbered from 1 in directory-listing order; the file named
    'Allalleles.fasta' is skipped and not numbered. Alleles are numbered from
    1 within each file.

    Returns:
        tuple: ``(ok, alleleList, bestmatches)`` where

        * ``ok`` is False if any exception occurred (the dictionaries then
          hold whatever was collected before the failure), True otherwise;
        * ``alleleList`` maps ``"<file number>--<allele number>"`` to the
          upper-cased allele sequence;
        * ``bestmatches`` maps ``"<file number>"`` to a per-locus placeholder
          list used when searching for new alleles.
    """
    alleleList = {}
    bestmatches = {}

    try:
        queryFilesOnDir = [f for f in listdir(queryPath) if isfile(join(queryPath, f))]
        countFiles = 0
        for queryFile in queryFilesOnDir:
            if queryFile == 'Allalleles.fasta':
                continue

            countFiles += 1
            AllqueryFile = os.path.join(queryPath, queryFile)
            countAlleles = 0
            for allele in HTSeq.FastaReader(AllqueryFile):
                countAlleles += 1
                alleleList[str(countFiles) + '--' + str(countAlleles)] = str(allele.seq).upper()

            # One entry per locus. Fields, in order: Score, ScoreRatio, Found,
            # queryName, HitName, MatchObject, lengthReference, lengthQuery,
            # number of existing alleles for the locus, path of the locus file.
            bestmatches[str(countFiles)] = [0, 0, False, '', '', '', 0, '',
                                            str(countAlleles), AllqueryFile]
    except Exception:
        # Best effort: report the failure and hand back what was collected.
        print('An error occurred')
        return False, alleleList, bestmatches

    return True, alleleList, bestmatches
示例#4
0
def create_tables(cims=True):
    """Build peak tables and FASTA files for every file matching 'cims_out/*'.

    Args:
        cims: When true (default) the CIMS inputs are processed here. When
            false the work is delegated to ``create_cits_tables`` and nothing
            else is done.

    For each input file the peaks are loaded, their columns renamed, filtered
    with ``apply_filter`` (fdr 0.001), assigned to genes, annotated with a
    biotype, restricted to 'protein_coding', given sequences (expand=10),
    written as FASTA under 'fasta/', sorted by 'height' in descending order and
    written as a table under 'cims_tables/'.
    """
    # Load some library files.
    print("create_tables() called.")
    fasta_filename = '/scratch/indexes/WS235.fa'
    gtf_filename = '../clip/lib/gtf_with_biotype_column.txt'
    sequences = dict((p.name.split(' ')[0], p.seq)
                     for p in HTSeq.FastaReader(fasta_filename))
    gtf_df = pandas.read_csv(gtf_filename, sep='\t')
    if not cims:
        create_cits_tables(sequences, gtf_df)
        return

    for filename in glob.glob('cims_out/*'):
        print("create_tables(): %s" % filename)
        print("Loading peaks...")
        peaks = pandas.read_csv(filename, sep='\t')
        rename_columns(peaks)
        peaks = apply_filter(peaks, cits=False, fdr=0.001)
        assign_cims_cits_to_gene.assign_table(peaks,
                                              gtf_df=gtf_df,
                                              given_gtf=True)
        annotate_peaks_with_gene_type.annotate_peaks_with_gene_type(
            peaks, gtf_filename=gtf_filename)
        peaks = peaks[peaks['biotype'] == 'protein_coding']
        get_sequences_for_table(peaks, sequences, expand=10)
        write_fasta(peaks, 'fasta/' + os.path.basename(filename))
        # NOTE(review): DataFrame.sort was removed from newer pandas releases
        # in favour of sort_values; confirm the pinned pandas version before
        # switching.
        peaks.sort('height', ascending=0, inplace=True)
        write_peaks_table(peaks, filename, tables_folder='cims_tables/')
        print("Finished processing...")
示例#5
0
def read_uniprot_to_dic(in_file, mode="full"):
    """
    Reads a uniprot database and stores the identifier and sequence in a dic

    Parameters:
    ---------------------
    in_file: str,
             file location for the fasta database

    mode: str,
          Either "full" or "seq". "seq" only stores the sequence while "full"
          stores the whole sequence object including name, description etc.
          Any other value prints an error message and yields an empty dict.

    Returns:
    -------------------------
    db_dic: dict,
            <key:value> with <uniprot_id>: Sequence, where the key is the
            result of get_uniprot() applied to the record name
    """
    fasta = HTSeq.FastaReader(in_file)
    uniprot_dic = {}

    if mode == "seq":
        for seq in fasta:
            uniprot_dic[get_uniprot(seq.name)] = seq.seq
    elif mode == "full":
        for seq in fasta:
            uniprot_dic[get_uniprot(seq.name)] = seq
    else:
        print("Error! Unsupported mode: %s" % mode)

    return uniprot_dic
示例#6
0
def getFASTAarray(FASTAfile, genomeArray):
    """Store each sequence of *FASTAfile* in *genomeArray* by file position.

    Keys are the 1-based position of the record in the file rendered as a
    string ("1", "2", ...). The mapping is updated in place and also returned.
    """
    for position, record in enumerate(HTSeq.FastaReader(FASTAfile), 1):
        genomeArray[str(position)] = record.seq
    return genomeArray
示例#7
0
def add_minus_three_c_column(peaks):
    """Add 0/1 motif indicator columns computed from the lower-cased 'seq' column.

    If *peaks* has no 'seq' column, sequences are first attached with
    ``get_sequences_for_table`` using the WS235 genome (expand=10). The 'seq'
    column is then lower-cased in place and one integer column is written per
    motif; a value of 1 means the regular expression matches anywhere in the
    sequence:

    * ``minus_three_c``: ``c\\w\\wtgt\\w\\w\\wat``
    * ``minus_four_c``:  ``c\\w\\w\\wtgt\\w\\w\\wat``
    * ``tgt``:           ``tgt``
    * ``has_fbe``:       ``tgt\\w\\w\\wat``
    * ``minus_one_c``:   ``ctgt\\w\\w\\wat``
    * ``minus_two_c``:   ``c\\wtgt\\w\\w\\wat``

    Args:
        peaks: pandas DataFrame; modified in place.

    Returns:
        The same DataFrame object.
    """
    if 'seq' not in peaks.columns:
        fasta_filename = '/scratch/indexes/WS235.fa'
        sequences = dict(
            (p.name.split(' ')[0], p.seq) for p in HTSeq.FastaReader(fasta_filename))
        get_sequences_for_table(peaks, sequences, expand=10)

    # (column, pattern) pairs, listed in the order the columns are created.
    # Raw strings keep the \w escapes out of Python's string-escape handling.
    motif_columns = (
        ('minus_three_c', r'c\w\wtgt\w\w\wat'),
        ('minus_four_c', r'c\w\w\wtgt\w\w\wat'),
        ('tgt', r'tgt'),
        ('has_fbe', r'tgt\w\w\wat'),
        ('minus_one_c', r'ctgt\w\w\wat'),
        ('minus_two_c', r'c\wtgt\w\w\wat'),
    )

    lowered = [x.lower() for x in peaks['seq'].tolist()]
    peaks['seq'] = lowered
    # Assign whole columns at once so every column exists (also for an empty
    # table) and holds integers, rather than growing columns cell by cell.
    for column, pattern in motif_columns:
        peaks[column] = [
            1 if re.search(pattern, seq) is not None else 0 for seq in lowered
        ]
    return peaks
示例#8
0
文件: read_utils.py 项目: xtmgah/SVE
def read_fasta_chrom(fasta_path, chrom):
    """Return the first record of *fasta_path* whose name equals *chrom*.

    Records are scanned in file order and scanning stops at the first match.
    An empty string is returned when no record name matches.
    """
    for record in ht.FastaReader(fasta_path):
        if record.name != chrom:
            continue
        return record
    return ''
def concatAllQueries(queryPath, maxBP, maxalleles):
    """Collect alleles from every FASTA file in *queryPath* into 'Allalleles.fasta'.

    'Allalleles.fasta' is created (or emptied) inside *queryPath* first. Each
    other regular file in the directory is then read, and its accepted alleles
    are handed to ``CreateNewAlleleFile`` as ">N\\nSEQUENCE\\n" entries, where N
    is a running number across all files and SEQUENCE is upper-cased.

    Args:
        queryPath: directory holding the per-locus FASTA files.
        maxBP: alleles longer than this are skipped; None disables the filter.
        maxalleles: maximum number of accepted alleles taken from each file
            (converted with ``int``); None means no limit.

    Returns:
        bool: True on success, False if any exception was raised.
    """
    try:
        allAllelesPath = os.path.join(queryPath, 'Allalleles.fasta')
        # Create/truncate the combined file before listing the directory.
        with open(allAllelesPath, 'w'):
            pass
        queryFilesOnDir = [f for f in listdir(queryPath) if isfile(join(queryPath, f))]
        countAlleles = 0
        for queryFile in queryFilesOnDir:
            if queryFile == 'Allalleles.fasta':
                continue
            ToWrite = []
            count = 0  # alleles accepted from the current file

            for allele in HTSeq.FastaReader(os.path.join(queryPath, queryFile)):
                if maxalleles is not None and int(maxalleles) == count:
                    break
                if maxBP is not None and len(str(allele.seq)) > maxBP:
                    # Too long: skipped without counting towards maxalleles.
                    continue
                countAlleles += 1
                ToWrite.append(">" + str(countAlleles) + "\n" + str(allele.seq).upper() + "\n")
                count += 1
            CreateNewAlleleFile(allAllelesPath, ToWrite)
    except Exception:
        # Best effort: report the failure to the caller via the return value.
        print('An error occurred')
        return False

    return True
示例#10
0
def CreateQueryDatabase(FASTAfile, databasePath, queryProteomeName):
    """Translate the alleles of *FASTAfile* and build a BLAST database from them.

    Every record that ``translateSeq`` can translate is written, as a FASTA
    entry under its original name, to the file *queryProteomeName*. Records
    that fail to translate are reported with 'Could not translate' and
    skipped. ``Create_Blastdb`` is then called on that file.

    Args:
        FASTAfile: path of the nucleotide FASTA file to translate.
        databasePath: directory used to derive the database path.
        queryProteomeName: name of the protein FASTA file to write; also used
            to derive the database path.

    Returns:
        tuple: ``(databasePath, isEmpty)`` where ``databasePath`` is
        ``join(databasePath, queryProteomeName)`` cut at its first '.' with
        '_db' appended, and ``isEmpty`` is True when no record could be
        translated.
    """
    entries = []
    for allele in HTSeq.FastaReader(FASTAfile):
        try:
            protein = str(translateSeq(allele.seq))
        except Exception:
            # Untranslatable alleles are skipped; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            print('Could not translate')
            continue
        entries.append(">" + str(allele.name) + "\n" + protein + "\n")
    isEmpty = not entries

    databasePath = os.path.join(databasePath, queryProteomeName)
    databasePath = databasePath.split('.')[0]
    databasePath = databasePath + '_db'
    # Text mode: the payload is a str, which a binary handle rejects on Python 3.
    with open(queryProteomeName, "w") as v:
        v.write("".join(entries))
    Create_Blastdb(queryProteomeName, 1, True, databasePath)

    return databasePath, isEmpty
示例#11
0
文件: read_utils.py 项目: xtmgah/SVE
def read_fasta_substring(fasta_path, chrom, pos, end):
    """Return the slice ``[pos:end]`` of the first record named *chrom*.

    Records are scanned in file order and scanning stops at the first match.
    An empty string is returned when no record name matches.
    """
    for record in ht.FastaReader(fasta_path):
        if record.name != chrom:
            continue
        return record[pos:end]
    return ''
示例#12
0
文件: read_utils.py 项目: xtmgah/SVE
def read_fasta(fasta_path, dictionary=False, trimN=False):
    """Read all records of a FASTA file.

    Args:
        fasta_path: path of the FASTA file.
        dictionary: when true, return a dict mapping record name to record
            (a later record replaces an earlier one with the same name);
            otherwise return a list of records in file order.
        trimN: when true, every 'N' is removed from each returned record's
            sequence via ``seq.replace('N', '')``.

    Returns:
        dict or list of the records, as selected by *dictionary*.
    """
    if dictionary:
        ref = dict((s.name, s) for s in ht.FastaReader(fasta_path))
        records = ref.values()
    else:
        ref = list(ht.FastaReader(fasta_path))
        records = ref
    if trimN:
        # The list branch used to index with an undefined name (`k`) and
        # raised NameError; both container kinds now share one loop.
        # NOTE(review): replace('N', '') assumes a str sequence; confirm for
        # HTSeq builds that store sequences as bytes.
        for record in records:
            record.seq = record.seq.replace('N', '')
    return ref
示例#13
0
def fasta_to_dataframe(infile, idindex=0):
    """Load FASTA records into a DataFrame indexed by record name.

    Each record contributes its name, its decoded sequence and its
    description; the resulting columns are 'sequence' and 'description'.
    *idindex* is accepted but not used.
    """
    rows = []
    for record in HTSeq.FastaReader(infile):
        rows.append((record.name, record.seq.decode(), record.descr))
    frame = pd.DataFrame(rows, columns=['name', 'sequence', 'description'])
    return frame.set_index(['name'])
示例#14
0
def CanProVar_to_table(in_file, out_folder):
    """
    Writes the CanProVar results from the fasta DB to a table
    and returns the indexed pandas dataframe

    The description of every record is split on ';' into mutations and each
    mutation on ':' into a dbSNP id and a native tag. From the native tag the
    first run of digits becomes POS and the letter groups around a digit run
    become FROM and TO.

    Two tab-separated files are written by appending their names directly to
    *out_folder* (no path separator is inserted): 'canprovar_ensemble_ids.csv'
    with the unique record names and 'canprovar_tab.csv' with all mutations.

    Returns:
        DataFrame of all mutations indexed by 'ensemble_id' (the record name).
    """
    # Raw strings keep the \d escapes out of Python's string-escape handling.
    position_pattern = r"(\d+)"
    change_pattern = r"([A-Z*-]+)\d+([A-Z*-]+)"

    fasta = HTSeq.FastaReader(in_file)
    ensemble_id = []
    dbsnp_ids = []
    FROM = []
    TO = []
    POS = []
    ID = []
    native_id = []

    # iterate over the fasta file (CanProVar format) and collect the
    # mutations that are written to the description
    for seq in fasta:
        # multiple mutations are separated by ; in the source file
        split = seq.descr.split(";")
        if split[0] == '':
            continue
        # for all mutations generate the specific tag
        # i.e. FROM, TO, POS = A, D, 20
        for mutation in split:
            single_mut = mutation.split(":")
            pos = int(re.search(position_pattern, single_mut[1]).groups()[0])
            muts = re.search(change_pattern, single_mut[1]).groups()
            POS.append(pos)
            FROM.append(muts[0])
            TO.append(muts[1])
            ensemble_id.append(seq.name)
            dbsnp_ids.append(single_mut[0])
            native_id.append(single_mut[1])
            ID.append("CanProVar")

    # convert lists to dataframe
    canprovar_df = pd.DataFrame()
    canprovar_df["FROM"] = FROM
    canprovar_df["TO"] = TO
    canprovar_df["POS"] = POS
    canprovar_df["ID"] = ID
    canprovar_df["native_id"] = native_id
    canprovar_df["ensemble_id"] = ensemble_id
    canprovar_df["dbsnp_ids"] = dbsnp_ids

    # extract only the ensemble_ids
    ensemble_ids = pd.DataFrame()
    ensemble_ids["ids"] = np.unique(canprovar_df["ensemble_id"])
    ensemble_ids.to_csv(out_folder + "canprovar_ensemble_ids.csv", sep="\t")

    # index dataframe to be addressed by the ensemble id via
    # canprovar_df.loc["ENSP00000370532"]
    canprovar_df.to_csv(out_folder + "canprovar_tab.csv", sep="\t")
    canprovar_df = canprovar_df.set_index("ensemble_id")
    return canprovar_df
示例#15
0
def fasta_to_dict(fasta_filename):
    """Map each sequence tag of a collapsed FASTA file to its read count.

    Parses the result of fastq_to_unique_fasta.py. The count is taken from the
    last '_'-separated field of the record name with every 'x' removed.

    Returns:
        defaultdict(int) keyed by sequence.
    """
    read_counts = defaultdict(int)
    for record in HTSeq.FastaReader(fasta_filename):
        count_field = record.name.split('_')[-1]
        read_counts[record.seq] = int(count_field.replace('x', ''))
    return read_counts
def chromosome_names_and_lengths_from_fasta(fasta_fname):
    """Write '<fasta_fname>.chrom_lengths' next to the FASTA file.

    The output holds one "name<TAB>length" line per record name, joined with
    newlines and without a trailing newline. When a name occurs more than
    once, the length of its last record is reported.
    """
    # Each raw_iterator item is indexed as item[0] = sequence, item[1] = name.
    # Only the lengths are kept so the sequences need not stay in memory.
    lengths = {
        record[1]: len(record[0])
        for record in HTSeq.FastaReader(fasta_fname, raw_iterator=True)
    }
    out_path = PurePath(os.path.dirname(fasta_fname),
                        os.path.basename(fasta_fname) + '.chrom_lengths')
    with open(out_path, 'w') as f:
        f.write('\n'.join(f'{name}\t{length}' for name, length in lengths.items()))
示例#17
0
def filter_fasta(infile):
    """Copy *infile* to 'filtered.fa', dropping unavailable sequences.

    A record is dropped when its sequence equals 'Sequence unavailable'. The
    remaining records are rewritten from their name and sequence only.
    """
    # Read everything before opening the output, as the input is fully parsed
    # before 'filtered.fa' is created or truncated.
    records = [(s.name, s.seq) for s in HTSeq.FastaReader(infile)]
    # The context manager closes (and flushes) the output file, which the
    # previous bare open() never did.
    with open('filtered.fa', "w") as out:
        for name, seq in records:
            if seq == 'Sequence unavailable':
                continue
            HTSeq.Sequence(seq, name).write_to_fasta_file(out)
    return
示例#18
0
def get_sequences(combined):
    """Fill the 'seq' column of *combined* from the WS235 genome, in place.

    Genome records are keyed by their name with 'CHROMOSOME_' removed. For each
    row the slice ``[left:right]`` of the row's 'chrm' sequence is taken and
    passed through ``rc()`` when 'strand' is '-'. Nothing is returned.
    """
    genome_path = 'lib/c_elegans.WS235.genomic.fa'
    genome = {
        re.sub('CHROMOSOME_', '', record.name): record.seq
        for record in HTSeq.FastaReader(genome_path)
    }
    for row_label, _row in combined.iterrows():
        left = combined.loc[row_label, 'left']
        right = combined.loc[row_label, 'right']
        chromosome = combined.loc[row_label, 'chrm']
        fragment = genome[chromosome][left:right]
        on_minus_strand = combined.loc[row_label, 'strand'] == '-'
        combined.loc[row_label, 'seq'] = rc(fragment) if on_minus_strand else fragment
示例#19
0
def spliting_referance(referance, destination):
    """Write each amplicon record to its own '<name>.fa' file in *destination*.

    The file name is the record name with surrounding whitespace stripped.

    Returns:
        str: the written file paths joined with commas, in record order.
    """
    # NOTE(review): `referance` is never read; the records come from the
    # module-level AMPLICON_FASTA. Confirm whether the parameter was meant to
    # be the input path.
    amplicon_paths = []
    for record in HTSeq.FastaReader(AMPLICON_FASTA):
        target_path = os.path.join(destination, record.name.strip() + ".fa")
        amplicon_paths.append(target_path)
        with open(target_path, "w") as handle:
            record.write_to_fasta_file(handle)
    return ",".join(amplicon_paths)
示例#20
0
def openDNA(filename):
    """Open a FASTA or FASTQ file, chosen by its file extension.

    Args:
        filename: path whose extension selects the reader: '.fna', '.fasta',
            '.ffn', '.faa' or '.frn' for FASTA; '.fq' or '.fastq' for FASTQ.

    Returns:
        tuple: ``(reader, num_lines)``. For FASTA, ``num_lines`` is the number
        of lines in the file; for FASTQ it is the line count divided by four,
        truncated to an integer.

    Raises:
        Exception: if the extension is not one of the above.
    """
    extension = os.path.splitext(filename)[1]
    if extension in ['.fna', '.fasta', '.ffn', '.faa', '.frn']:
        print('File ' + filename + ' is type FastA.')
        reader = HTSeq.FastaReader(filename)
        lines_per_record = 1
    elif extension in ['.fq', '.fastq']:
        print('File ' + filename + ' is type FastQ.')
        reader = HTSeq.FastqReader(filename)
        lines_per_record = 4  # 1/4 of lines are sequences in FastQ
    else:
        raise Exception('Unknown file type, exiting.')
    # Count lines inside a context manager so the handle is closed again.
    with open(filename) as handle:
        total_lines = sum(1 for _line in handle)
    return reader, total_lines // lines_per_record
示例#21
0
def collapse_reads(infile, outfile=None, min_length=15):
    """Collapse identical reads, writing collapsed reads to a new fasta file.
      Retains copy number in fasta headers. Each sequence in the resulting file
      should be unique.

    The reader is chosen by extension: '.fastq' is read as FASTQ with the
    "solexa" quality scale, '.fa' and '.fasta' as FASTA. Output headers have
    the form '<rank>_<copies>', where rank is the position after sorting by
    copy number in descending order.

    Args:
        infile: input fastq/fasta file
        outfile: output fasta file with collapsed reads; defaults to the input
            path with its extension replaced by '_collapsed.fa'
        min_length: minimum length of read to include
    Returns:
        True if successful, False if the extension is not recognised
    """

    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '_collapsed.fa'
    print('collapsing reads %s' % infile)
    ext = os.path.splitext(infile)[1]
    if ext == '.fastq':
        fastfile = HTSeq.FastqReader(infile, "solexa")
    elif ext == '.fa' or ext == '.fasta':
        fastfile = HTSeq.FastaReader(infile)
    else:
        print('not fasta or fastq')
        return False

    total = 0
    f = {}
    for s in fastfile:
        seq = s.seq.decode()
        if seq in f:
            f[seq]['reads'] += 1
        else:
            f[seq] = {'name': s.name, 'reads': 1}
        total += 1

    df = pd.DataFrame.from_dict(f, orient='index')
    df.index.name = 'seq'
    df = df.reset_index()
    l = df.seq.str.len()
    df = df[l >= min_length]
    # axis is passed by keyword: newer pandas no longer accepts it positionally.
    df = df.drop(['name'], axis=1)
    df = df.sort_values(by='reads', ascending=False).reset_index()
    df['read_id'] = df.index.copy()
    df['read_id'] = df.apply(lambda x: str(x.read_id) + '_' + str(x.reads), axis=1)
    utils.dataframe_to_fasta(df, idkey='read_id', outfile=outfile)
    print('collapsed %s reads to %s' % (total, len(df)))
    return True
示例#22
0
def readdna(filename):
    """
    Reads in the dna sequence of the given fasta

    If the file holds several records, the last one is returned.

    @type  filename: string
    @param filename: Fasta-file used as input.
    @rtype:   HTSeq Sequence object
    @return:  Reference Fasta.
    @raise ValueError: if the file contains no record.
    """
    last_record = None
    for fasta in HTSeq.FastaReader(filename):
        last_record = fasta
    if last_record is None:
        # Previously surfaced as an UnboundLocalError on the return statement.
        raise ValueError("No FASTA record found in %s" % filename)
    return HTSeq.Sequence(last_record.seq, last_record.name)
示例#23
0
def adjust_peak_width(input_folder, table_dir='cims_alt_tables/'):
    """Re-extract sequences for every peaks table in *input_folder*.

    Each tab-separated file matching ``input_folder + '/*'`` is loaded, given
    sequences from the WS235 genome (expand=10, max_width=False), written as
    FASTA under 'fasta/', sorted by 'height' in descending order and written
    as a table under *table_dir*, which is created if missing.

    Args:
        input_folder: directory holding the input peak tables.
        table_dir: output directory for the rewritten tables.
    """
    fasta_filename = '/scratch/indexes/WS235.fa'
    if not os.path.exists(table_dir):
        # Create the directory directly instead of building a shell command.
        os.makedirs(table_dir)
    sequences = dict((p.name.split(' ')[0], p.seq)
                     for p in HTSeq.FastaReader(fasta_filename))
    for filename in glob.glob(input_folder + '/*'):
        peaks = pandas.read_csv(filename, sep='\t')
        print(peaks.head())
        get_sequences_for_table(peaks, sequences, expand=10, max_width=False)
        write_fasta(peaks, 'fasta/' + os.path.basename(filename))
        # NOTE(review): DataFrame.sort was removed from newer pandas releases
        # in favour of sort_values; confirm the pinned pandas version before
        # switching.
        peaks.sort('height', ascending=0, inplace=True)
        write_peaks_table(peaks, filename, tables_folder=table_dir)
示例#24
0
def build_fa_blob(target, source, env):
    """Pickle a name -> sequence dict built from a FASTA file.

    The first element of *source* is read as FASTA and the resulting dict is
    pickled to the first element of *target*. *env* is not used.
    """
    blob_path = str(target[0])
    fasta_path = str(source[0])

    with open(fasta_path, "r") as fasta_handle:
        sequences_by_name = {
            record.name: record.seq
            for record in HTSeq.FastaReader(fasta_handle)
        }

    with open(blob_path, "wb") as blob_handle:
        pickle.dump(sequences_by_name, blob_handle)
示例#25
0
    def readSequences(self, args):
        """Load every record of the files in ``args.fasta`` into ``self.fasta``.

        All paths are checked with ``fileExists`` before anything is read; the
        first missing one raises PSToolException. ``self.fasta`` is then reset
        to an empty dict and filled with one MinimalSeq per record, keyed by
        record name.
        """
        for candidate in args.fasta:
            if fileExists(candidate):
                continue
            raise PSToolException("Fasta file does not exist: " +
                                  str(candidate))

        # Bind the fresh dict to the attribute first so it is filled in place.
        self.fasta = loaded = {}

        for fasta_path in args.fasta:
            for record in HTSeq.FastaReader(fasta_path):
                loaded[record.name] = MinimalSeq(record.seq, record.name,
                                                 record.descr)
示例#26
0
def returnSequence(fasta):
    """
    Returns the sequence of the first record of a fasta file.

    Nothing (None) is returned when the file holds no record.

    @type  fasta: string
    @param fasta: path to fasta file.

    @rtype:   string
    @return:  sequence
    """
    first_record = next(iter(HTSeq.FastaReader(fasta)), None)
    if first_record is None:
        return None
    return first_record.seq
示例#27
0
def get_sequences(combined):
    """Fill the 'seq' column of *combined* from the WS235 genome, in place.

    Genome records are keyed by their name with 'CHROMOSOME_' removed. For each
    row the slice ``[left:right]`` of the row's 'chrm' sequence is taken and
    passed through ``rc()`` when 'strand' is '-'. Nothing is returned.
    """
    genome_path = 'lib/c_elegans.WS235.genomic.fa'
    genome = {}
    for record in HTSeq.FastaReader(genome_path):
        genome[re.sub('CHROMOSOME_', '', record.name)] = record.seq

    for row_label, _row in combined.iterrows():
        left = combined.loc[row_label, 'left']
        right = combined.loc[row_label, 'right']
        chromosome = combined.loc[row_label, 'chrm']
        fragment = genome[chromosome][left:right]
        if combined.loc[row_label, 'strand'] == '-':
            fragment = rc(fragment)
        combined.loc[row_label, 'seq'] = fragment
示例#28
0
def generate_seq_stats(seqfile, header, table=None, fastqfile=True):
    '''
    Compute length statistics and a length histogram for a sequence file.

    If no table object is provided, a table is created whose 'Statistic'
    column holds the row labels. In both cases a column named *header* is
    added to the table with one value per row label, in the same order.
    A table that is passed in is modified in place.

    table: existing table (for adding a column)
    seqfile: path to sequencefile (fasta/fastq)
    header: name of column
    fastqfile: the function assumes a fastq file. "False" will accept fasta

    Returns a tuple (table, histogram counts as list, bin edges as list).
    Raises ValueError if the file contains no sequences (max() of an empty
    list).
    '''
    if not table:
        table = {
            'Statistic': [
                'Count (#)', 'Length (bp)', 'Over 100 bp', 'Over 500 bp',
                'Over 1000 bp', 'Over 5000 bp', 'Over 10000 bp',
                'Largest (bp)', 'Smallest (bp)', 'Average length (bp)',
                'Median (bp)', 'N50'
            ]
        }

    # Parse sequencefile
    reader_class = HTSeq.FastqReader if fastqfile else HTSeq.FastaReader
    seqlengths = [len(s[0]) for s in reader_class(seqfile, raw_iterator=True)]

    # Calculate statistics; the order must match the 'Statistic' labels above.
    column = [len(seqlengths), sum(seqlengths)]
    for threshold in (100, 500, 1000, 5000, 10000):
        column.append(len([x for x in seqlengths if x > threshold]))
    longest = max(seqlengths)
    column.append(longest)
    column.append(min(seqlengths))
    column.append(np.mean(seqlengths))
    # The median row was previously skipped, which shifted N50 into the
    # 'Median (bp)' row and left the column one value short.
    column.append(np.median(seqlengths))
    column.append(calculate_n50(seqlengths))
    table[header] = column

    # Create histogram data: one bin per 10 bp of the longest sequence, but at
    # least one bin so inputs shorter than 10 bp do not request zero bins.
    hist, edges = np.histogram(seqlengths,
                               density=False,
                               bins=max(1, int(longest / 10)))
    return (table, hist.tolist(), edges.tolist())
def readFile(filename, fileType):
    """Concatenate all reads of a FASTA/FASTQ file into upper-case characters.

    :param filename: path of the sequence file
    :param fileType: FASTA or FASTQ (module-level constants); any other value
        yields an empty result
    :rtype : return type is DNA Sequence as a list of characters
    """
    if fileType == FASTA:
        reads = HTSeq.FastaReader(filename)
    elif fileType == FASTQ:
        reads = HTSeq.FastqReader(filename)
    else:
        reads = ()  # unknown type: nothing to read
    # Join once instead of growing a string read by read.
    # NOTE(review): assumes read.seq is a str; confirm for HTSeq builds that
    # store sequences as bytes.
    sequence = "".join(read.seq for read in reads)
    # Build a real list so the documented return type also holds on Python 3.
    return [base.upper() for base in sequence]
def extract_exons(fasta_fname, gff_fname):
    # Writes, for every feature name in the GFF file, the concatenated
    # sequence of its 'Exon' features to stdout via binf.write_fasta_seq.
    sequences = HTSeq.FastaReader(fasta_fname)
    # end_included=True as (exon.end - exon.start) % 3 = 2.
    gff = HTSeq.GFF_Reader(gff_fname, end_included=True)

    # Group features first by name, then by feature type.
    features = defaultdict(lambda: defaultdict(list))
    for feat in gff:
        features[feat.name][feat.type].append(feat)

    for kog, feats in features.items():
        exons = feats['Exon']
        # Exons are joined in ascending order of their interval start.
        exons = sorted(exons, key=lambda e: e.iv.start)
        # NOTE(review): indexing the reader with an interval presumably needs
        # an indexed FASTA; confirm against the HTSeq version in use.
        seq = ''.join([str(sequences[exon.iv]) for exon in exons])
        binf.write_fasta_seq(sys.stdout, kog, seq)