Exemplo n.º 1
0
def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):
    """Split contigs by whether they have a BLAST hit against the rask genes.

    ``run_blast`` is called with ``raskFasta`` and ``contig_file``; the first
    whitespace-separated field of each line of the file it returns is taken
    as the name of a contig with a hit.  Every record read from
    ``contig_file`` is then written to one of two FASTA files.

    Args:
        raskFasta: FASTA file of rask VAR genes, handed to ``run_blast``.
        contig_file: FASTA file with the contigs to classify.
        fileName: Prefix of the two output file names.
        outdir: Output location.  It is concatenated with ``fileName``
            as-is; no path separator is inserted.
        verbose: Forwarded to ``run_blast``; when true the number of records
            written to each file is printed.

    Returns:
        Tuple ``(rask, non_rask)``: paths of the FASTA files holding the
        contigs with and without a hit.
    """
    # run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)

    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            fields = line.split()
            # A blank line has no fields; skip it rather than raising
            # IndexError on fields[0].
            if fields:
                rask_hits.add(fields[0])

    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"

    count_rask = 0
    count_non_rask = 0

    with open(rask, 'w') as raskout, open(non_rask, 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            record = ">" + h + "\n" + s + "\n"
            if h in rask_hits:
                raskout.write(record)
                count_rask += 1
            else:
                nonraskout.write(record)
                count_non_rask += 1

    if verbose:
        # Single-argument print() produces the same text on Python 2 and 3.
        print("%d  annotated to rask DB..." % count_rask)
        print("%d  remaining" % count_non_rask)

    return rask, non_rask
Exemplo n.º 2
0
def analyse_contigs(contig_file,
                    read1,
                    read2,
                    outputdir,
                    fasta_ref_files=None,
                    verbose=False):
    """Run the contig analysis steps and return the generated summary.

    The steps run in this order: ``reNameContigs``, one ``run_blast`` call
    per reference file, ``align_w_subread``, ``convert_to_bam_create_index``
    and finally ``generate_summary``.

    Args:
        contig_file: Contig file passed to ``reNameContigs``.
        read1: First read file passed to ``align_w_subread``.
        read2: Second read file passed to ``align_w_subread``.
        outputdir: Output directory handed to the renaming, BLAST, alignment
            and summary steps.
        fasta_ref_files: Optional iterable of reference files to run BLAST
            with; ``None`` (the default) means no references.
        verbose: Forwarded to the BLAST, alignment and BAM conversion steps.

    Returns:
        The value returned by ``generate_summary``.
    """
    # None replaces the former mutable ``[]`` default, which would be shared
    # between calls.
    if fasta_ref_files is None:
        fasta_ref_files = []

    # first prepare blast files for analysis
    renamedContigs = reNameContigs(contig_file, outputdir)

    blast_files = [
        run_blast(reference, renamedContigs, outputdir, verbose)
        for reference in fasta_ref_files
    ]

    # now align reads to contigs
    samfile = align_w_subread(read1, read2, renamedContigs, outputdir, verbose)

    # now convert to bam
    bamfile = convert_to_bam_create_index(renamedContigs, samfile, verbose)

    # now compute analytics
    return generate_summary(bamfile, outputdir, renamedContigs, blast_files)
Exemplo n.º 3
0
def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):
    """Write contigs with and without a rask BLAST hit to separate files.

    The file returned by ``run_blast(raskFasta, contig_file, ...)`` is read
    line by line and the first whitespace-separated field of each line is
    collected as the name of a contig with a hit.  The records of
    ``contig_file`` are then routed to ``<outdir><fileName>_rask.fa`` or
    ``<outdir><fileName>_nonrask.fa``.

    Args:
        raskFasta: FASTA file of rask VAR genes, handed to ``run_blast``.
        contig_file: FASTA file with the contigs to classify.
        fileName: Prefix of the two output file names.
        outdir: Output location, concatenated with ``fileName`` without
            adding a path separator.
        verbose: Forwarded to ``run_blast``; when true the per-file record
            counts are printed.

    Returns:
        Tuple ``(rask, non_rask)`` of the two output file paths.
    """
    # run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)

    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record, and fields[0] would raise
                # IndexError.
                continue
            rask_hits.add(fields[0])

    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"

    count_rask = 0
    count_non_rask = 0

    with open(rask, 'w') as raskout, open(non_rask, 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            if h in rask_hits:
                target = raskout
                count_rask += 1
            else:
                target = nonraskout
                count_non_rask += 1
            target.write(">" + h + "\n")
            target.write(s + "\n")

    if verbose:
        # Single-argument print() produces the same text on Python 2 and 3.
        print("%d  annotated to rask DB..." % count_rask)
        print("%d  remaining" % count_non_rask)

    return rask, non_rask
Exemplo n.º 4
0
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    """Separate contigs that overlap a contaminant reference from the rest.

    ``run_blast`` is called once per reference file.  For every line of each
    result file, the fourth field (presumably the alignment length; this
    depends on the output format ``run_blast`` requests) is divided by the
    length of the contig named in the first field.  A contig is flagged as
    soon as one line gives a ratio above ``percent_overlap``.

    Args:
        fasta_ref_files: Iterable of reference FASTA files whose sequences
            should not be present in the data (e.g. human).
        contig_file: FASTA file with the contigs to screen.
        fileName: Prefix of the two output file names.
        percent_overlap: Threshold on the hit-length / contig-length ratio.
            It is compared against a fraction, not a 0-100 percentage.
        outdir: Output location, concatenated with ``fileName`` without
            adding a path separator.
        verbose: Forwarded to ``run_blast``; when true the contig counts
            before and after filtering are printed.

    Returns:
        Tuple ``(non_contaminant_file, contaminant_file)`` of the two FASTA
        paths that were written.

    Raises:
        KeyError: If a BLAST line names a contig missing from
            ``contig_file``.
    """
    # first get list of contigs
    contigs = {h: s for h, s in FastaReader(contig_file)}

    if verbose:
        # The original ``print (a, b)`` printed a tuple repr under Python 2;
        # format the message into a single string instead.
        print("Number of contigs before contaminant filtering: %d"
              % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = [run_blast(reference, contig_file, outdir, verbose)
                   for reference in fasta_ref_files]

    # now iterate through blast results file removing contigs that have too
    # high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip instead of failing on tokens[0].
                    continue
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig (reported regardless of
                    # ``verbose``, once per offending line)
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    # now write out a fasta file of contaminant sequences
    contaminant_file = outdir + fileName + "contaminants.fa"
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    # now write contigs without contaminants to a file
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig not in bad_contigs:
                outfile.write(">" + contig + "\n")
                outfile.write(contigs[contig] + "\n")

    if verbose:
        print("Number of contigs after filtering: %d"
              % (len(contigs) - len(bad_contigs)))

    return non_contaminant_file, contaminant_file
Exemplo n.º 5
0
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    """Write contaminant and non-contaminant contigs to separate FASTA files.

    Each reference in ``fasta_ref_files`` is passed to ``run_blast`` together
    with ``contig_file``.  Every line of the resulting files is split on
    whitespace: field 1 is the contig name and field 4 (presumably the
    alignment length; depends on the format ``run_blast`` emits) is divided
    by that contig's sequence length.  Contigs with any ratio above
    ``percent_overlap`` are treated as contaminants.

    Args:
        fasta_ref_files: Iterable of reference FASTA files that should not
            be present in the data (e.g. human).
        contig_file: FASTA file with the contigs to screen.
        fileName: Prefix of the two output file names.
        percent_overlap: Threshold on the hit-length / contig-length ratio
            (a fraction, not a 0-100 percentage).
        outdir: Output location, concatenated with ``fileName`` without
            adding a path separator.
        verbose: Forwarded to ``run_blast``; when true the contig counts
            before and after filtering are printed.

    Returns:
        Tuple ``(non_contaminant_file, contaminant_file)``.

    Raises:
        KeyError: If a BLAST line names a contig missing from
            ``contig_file``.
    """
    # first get list of contigs
    contigs = {}
    for header, sequence in FastaReader(contig_file):
        contigs[header] = sequence

    if verbose:
        # ``print(a, b)`` without print_function shows a tuple repr on
        # Python 2, so build one string.
        print("Number of contigs before contaminant filtering: %d"
              % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, verbose))

    # now iterate through blast results file removing contigs that have too
    # high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip instead of failing on tokens[0].
                    continue
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig (message is printed even
                    # when ``verbose`` is false)
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    contaminant_file = outdir + fileName + "contaminants.fa"
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"

    # now write out a fasta file of contaminant sequences
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    # now write contigs without contaminants to a file
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig in bad_contigs:
                continue
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    if verbose:
        print("Number of contigs after filtering: %d"
              % (len(contigs) - len(bad_contigs)))

    return non_contaminant_file, contaminant_file
Exemplo n.º 6
0
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):
    """Write the contigs that do not overlap any reference to ``outfile``.

    ``run_blast`` is called once per reference file (always with ``True`` as
    its fourth argument).  For each line of the result files the fourth
    field (presumably the alignment length; depends on the format
    ``run_blast`` emits) is divided by the length of the contig named in the
    first field; contigs with a ratio above ``percent_overlap`` are dropped.

    Args:
        fasta_ref_files: Iterable of reference FASTA files that should not
            be present in the data (e.g. human).
        contig_file: FASTA file with the contigs to filter.
        percent_overlap: Threshold on the hit-length / contig-length ratio
            (a fraction, not a 0-100 percentage).
        outfile: Path of the FASTA file to write the kept contigs to.
        outdir: Output directory handed to ``run_blast``.

    Returns:
        None.  Contig counts and removed contigs are printed to stdout.

    Raises:
        KeyError: If a BLAST line names a contig missing from
            ``contig_file``.
    """
    # first get list of contigs
    contigs = {h: s for h, s in FastaReader(contig_file)}

    # Single-argument print() keeps the exact text on Python 2 and 3.
    print("Number of contigs before filtering:  %d" % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = [run_blast(reference, contig_file, outdir, True)
                   for reference in fasta_ref_files]

    # now iterate through blast results file removing contigs that have too
    # high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip instead of failing on tokens[0].
                    continue
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)
    for name in bad_contigs:
        del contigs[name]

    # now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig in contigs:
            outfas.write(">" + contig + "\n")
            outfas.write(contigs[contig] + "\n")

    print("Number of contigs after filtering:  %d" % len(contigs))
Exemplo n.º 7
0
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):
    """Drop contigs that overlap a reference and write the rest to a file.

    Each reference is passed to ``run_blast`` with ``contig_file`` (the
    fourth argument is always ``True``).  Every result line is split on
    whitespace; field 4 (presumably the alignment length; depends on the
    format ``run_blast`` emits) divided by the length of the contig named in
    field 1 gives the overlap.  Contigs whose overlap exceeds
    ``percent_overlap`` on any line are removed.

    Args:
        fasta_ref_files: Iterable of reference FASTA files that should not
            be present in the data (e.g. human).
        contig_file: FASTA file with the contigs to filter.
        percent_overlap: Threshold on the hit-length / contig-length ratio
            (a fraction, not a 0-100 percentage).
        outfile: Path of the FASTA file receiving the kept contigs.
        outdir: Output directory handed to ``run_blast``.

    Returns:
        None.  Progress is reported on stdout.

    Raises:
        KeyError: If a BLAST line names a contig missing from
            ``contig_file``.
    """
    # first get list of contigs
    contigs = {}
    for header, sequence in FastaReader(contig_file):
        contigs[header] = sequence

    # Single-argument print() keeps the exact text on Python 2 and 3.
    print("Number of contigs before filtering:  %d" % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, True))

    # now iterate through blast results file removing contigs that have too
    # high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip instead of failing on tokens[0].
                    continue
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    for name in bad_contigs:
        del contigs[name]

    # now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig, sequence in contigs.items():
            outfas.write(">" + contig + "\n")
            outfas.write(sequence + "\n")

    print("Number of contigs after filtering:  %d" % len(contigs))
Exemplo n.º 8
0
def analyse_contigs(contig_file, read1, read2, outputdir,
                    fasta_ref_files=None, verbose=False):
    """Rename, BLAST and align the contigs, then build the summary file.

    Calls, in order: ``reNameContigs``, ``run_blast`` (once per reference),
    ``align_w_subread``, ``convert_to_bam_create_index`` and
    ``generate_summary``.

    Args:
        contig_file: Contig file passed to ``reNameContigs``.
        read1: First read file passed to ``align_w_subread``.
        read2: Second read file passed to ``align_w_subread``.
        outputdir: Output directory handed to the renaming, BLAST, alignment
            and summary steps.
        fasta_ref_files: Optional iterable of reference files to run BLAST
            with; ``None`` (the default) means no references.
        verbose: Forwarded to the BLAST, alignment and BAM conversion steps.

    Returns:
        The value returned by ``generate_summary``.
    """
    # A None default avoids sharing one mutable list between calls.
    references = [] if fasta_ref_files is None else fasta_ref_files

    # first prepare blast files for analysis
    renamedContigs = reNameContigs(contig_file, outputdir)

    blast_files = []
    for reference in references:
        blast_files.append(
            run_blast(reference, renamedContigs, outputdir, verbose))

    # now align reads to contigs
    samfile = align_w_subread(read1, read2, renamedContigs, outputdir, verbose)

    # now convert to bam
    bamfile = convert_to_bam_create_index(renamedContigs, samfile, verbose)

    # now compute analytics
    outfile = generate_summary(bamfile, outputdir, renamedContigs,
                               blast_files)

    return outfile
Exemplo n.º 9
0
import sys, os
from third_party_runners import run_blast
from mungo.fasta import FastaReader

# Minimum ratio of the fourth BLAST field to a contig's length for a hit to
# count as an overlap (applied to either contig of the pair below).
OVERLAP = 0.7
# Minimum value of the third BLAST field for a hit to be considered
# (presumably percent identity -- confirm against run_blast's output format).
IDENTITY = 95

# Command-line arguments: the FASTA file to de-duplicate and the output
# directory handed to run_blast.
sequences = sys.argv[1]
outdir = sys.argv[2]


# first run all-vs-all blast (the same file is passed as both inputs)
bfile = run_blast(sequences, sequences, outdir, True)

# now get the lengths of all the contigs, keyed by FASTA header
contig_len = {}
for h,s in FastaReader(sequences):
    contig_len[h]=len(s)

# now iterate through search finding redundant contigs
redundant_contigs = set()
with open(bfile, 'r') as blastsearch:
    for line in blastsearch:
        tokens = line.strip().split()
        if tokens[0]==tokens[1]: # a contig matching itself is not redundancy
            continue
        if float(tokens[2]) >= IDENTITY:
            # print "identity ",tokens[2]
            # The hit qualifies when it covers at least OVERLAP of either
            # of the two contigs involved.
            if ((float(tokens[3])/contig_len[tokens[0]] >= OVERLAP) or 
                (float(tokens[3])/contig_len[tokens[1]] >= OVERLAP)):
                # NOTE(review): the body of this branch is not visible in
                # this excerpt.
                # print "overlap", max(float(tokens[3])/contig_len[tokens[0]],float(tokens[3])/contig_len[tokens[1]])
Exemplo n.º 10
0
import sys, os
from third_party_runners import run_blast
from mungo.fasta import FastaReader


# Split the records of a contig FASTA file into two files, depending on
# whether the contig name appears in the BLAST output produced by run_blast.
#
# Command line: <contig_file> <blastdb_file> <outdir>
# Output files: <outdir><name>_rask.fa and <outdir><name>_nonrask.fa, where
# <name> is the contig file's base name without extension.  ``outdir`` is
# concatenated as-is, without adding a path separator.

# Fail with a usage message instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 4:
    sys.exit("usage: %s <contig_file> <blastdb_file> <outdir>" % sys.argv[0])

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]


name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

# The first whitespace-separated field of every BLAST line names a contig
# with a hit.
rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        fields = line.split()
        # Blank lines have no fields; skip them rather than raising
        # IndexError.
        if fields:
            rask_hits.add(fields[0])

rask_path = outdir + name + "_rask.fa"
nonrask_path = outdir + name + "_nonrask.fa"
with open(rask_path, 'w') as raskout, open(nonrask_path, 'w') as nonraskout:
    for h, s in FastaReader(contig_file):
        target = raskout if h in rask_hits else nonraskout
        target.write(">" + h + "\n")
        target.write(s + "\n")


Exemplo n.º 11
0
import sys, os
from third_party_runners import run_blast
from mungo.fasta import FastaReader

# Route each contig of a FASTA file to "<name>_rask.fa" or
# "<name>_nonrask.fa" according to whether its name occurs in the BLAST
# output returned by run_blast.
#
# Command line: <contig_file> <blastdb_file> <outdir>
# ``outdir`` is prepended to the output names as-is; no path separator is
# added.

# Report missing arguments with a usage message rather than an IndexError.
if len(sys.argv) < 4:
    sys.exit("usage: %s <contig_file> <blastdb_file> <outdir>" % sys.argv[0])

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]

name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

# Collect the first field of every non-blank BLAST line: the names of the
# contigs that had a hit.
rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        fields = line.split()
        if not fields:
            # Blank line: fields[0] would raise IndexError.
            continue
        rask_hits.add(fields[0])

with open(outdir + name + "_rask.fa", 'w') as raskout, \
        open(outdir + name + "_nonrask.fa", 'w') as nonraskout:
    for h, s in FastaReader(contig_file):
        record = ">" + h + "\n" + s + "\n"
        if h in rask_hits:
            raskout.write(record)
        else:
            nonraskout.write(record)