Example #1
def test_prune_short():
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = physcraper.ConfigObj(configfi, interactive=False)
    ids = physcraper.IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)

    len_before = len(data_obj.tre.taxon_namespace)
    data_obj.prune_short(0.9)
    len_after = len(data_obj.tre.taxon_namespace)
    assert len_before > len_after
Example #2
def test_prune_short():
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = physcraper.ConfigObj(configfi, interactive=False)
    conf.blast_loc = 'remote'  # saves time over loading names and nodes, and they aren't used here

    ids = physcraper.IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  config_obj=conf,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)

    data_obj.config.seq_len_perc = 0.9
    len_before = len(data_obj.tre.taxon_namespace)
    data_obj.prune_short()
    len_after = len(data_obj.tre.taxon_namespace)
    assert len_before > len_after
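The two tests above exercise the same pruning step through different entry points: Example #1 passes the length threshold straight to prune_short, while Example #2 stores it on the attached config object first. A minimal sketch of the two call styles side by side, assuming prune_short falls back to config.seq_len_perc when called without an argument (inferred from these two tests, not from the physcraper source):

# Threshold as an explicit argument (Example #1 style):
data_obj.prune_short(0.9)

# Threshold taken from the attached config (Example #2 style);
# assumes prune_short reads config.seq_len_perc when no argument is given.
data_obj.config.seq_len_perc = 0.9
data_obj.prune_short()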
Example #3
def test_owndata():
    """Tests if your own input files will generate a data object of class AlignTreeTax."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=trfn,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    assert isinstance(data_obj, AlignTreeTax)
Example #4
def test_trim():
    seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_trim"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    # All alignment rows share one length, so the loop leaves the
    # pre-trim alignment length in len_start.
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start != len_end
Example #5
def load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                  ingroup_mrca):
    """
    Generates ATT object from own data.

    :param conf: conf object from physcraper
    :param seqaln: sequence alignment file
    :param mattype: format of sequence alignment
    :param trfn: tree file
    :param schema_trf: format of tree file
    :param workdir: working directory
    :param ingroup_mrca: mrca of ingroup as OTT ID
    :return: ATT object
    """
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    assert os.path.exists(otu_jsonfi)

    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        data_obj = pickle.load(
            open("{}/att_checkpoint.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           config_obj=conf,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=otu_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    assert isinstance(data_obj, AlignTreeTax)
    return data_obj
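A hedged usage sketch for load_own_data with the tiny test paths used elsewhere on this page; it assumes a ConfigObj built as in the earlier examples and requires {workdir}/otu_dict.json to exist already, since the function asserts its presence before doing anything else:

conf = ConfigObj("tests/data/test.config", interactive=False)
data_obj = load_own_data(conf,
                         seqaln="tests/data/tiny_test_example/test.fas",
                         mattype="fasta",
                         trfn="tests/data/tiny_test_example/test.tre",
                         schema_trf="newick",
                         workdir="tests/output/owndata",
                         ingroup_mrca=None)  # or an OTT ID to restrict the ingroup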
Example #6
def test():

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    # With the default settings, trim() should leave the alignment unchanged.
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start == len_end

    # Lowering trim_perc to 0.5 should shorten the alignment.
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.config.trim_perc = 0.5
    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start > len_end
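The test shows that trim() is a no-op under the default settings and removes data once trim_perc drops to 0.5. As a rough illustration of that kind of coverage-based trimming (an inferred sketch, not physcraper's actual implementation), assuming trim_perc is the minimum fraction of rows that must have data in a column for the column to be kept:

def trim_columns(seqs, trim_perc=0.75):
    """Drop alignment columns where fewer than trim_perc of rows have data."""
    n = len(seqs)
    length = len(next(iter(seqs.values())))
    keep = [i for i in range(length)
            if sum(s[i] not in "-?" for s in seqs.values()) / n >= trim_perc]
    return {tax: "".join(s[i] for i in keep) for tax, s in seqs.items()}

# Example: the final column has data in only 1 of 3 rows, so it is dropped.
print(trim_columns({"a": "ACGT-", "b": "ACGTT", "c": "ACGT-"}))
# {'a': 'ACGT', 'b': 'ACGT', 'c': 'ACGT'}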
Example #7
def test_generate_ATT_from_file():

    seqaln = "tests/data/input.fas"
    mattype = "fasta"
    workdir = "tests/fromfile"
    treefile = "tests/data/input.tre"
    otu_jsonfi = "tests/data/otu_dict.json"
    schema_trf = "newick"
    configfi = "tests/data/test.config"

    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")

    conf = ConfigObj(configfi, interactive=False)

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi)

    assert data_obj is not None
Example #8
def test_reconcile():
    seqaln = "tests/data/tiny_test_example/test.fas"
    seqalnmiss = "tests/data/tiny_test_example/test_missingseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    treefilemiss = "tests/data/tiny_test_example/test_missingtip.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "example.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "tests/data/tmp/owndata/otu_dict.json"

    conf = ConfigObj(configfi, interactive=False)

    data_obj = generate_ATT_from_files(seqaln=seqalnmiss,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == '2029_doronicum':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"

    #----------------------------------------------------

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefilemiss,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == 'S_scopolii':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"

    #----------------------------------------------------

    aln = DnaCharacterMatrix.get(path=seqalnmiss, schema=mattype)

    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(
            " ", "_")  # Forcing all spaces to underscore UGH

    tre = Tree.get(path=treefile,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    assert aln.taxon_namespace == tre.taxon_namespace
    assert aln.taxon_namespace is tre.taxon_namespace

    treed_taxa = set()
    for leaf in tre.leaf_nodes():
        treed_taxa.add(leaf.taxon)
    aln_tax = set()
    for tax, seq in aln.items():
        aln_tax.add(tax)

    prune = treed_taxa ^ aln_tax

    assert len(prune) == 1
    assert list(prune)[0].label == '2029_doronicum'

    #----------------

    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(
            " ", "_")  # Forcing all spaces to underscore UGH

    tre = Tree.get(path=treefilemiss,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    assert aln.taxon_namespace == tre.taxon_namespace
    assert aln.taxon_namespace is tre.taxon_namespace

    treed_taxa = set()
    for leaf in tre.leaf_nodes():
        treed_taxa.add(leaf.taxon)
    aln_tax = set()
    for tax, seq in aln.items():
        aln_tax.add(tax)

    prune = treed_taxa ^ aln_tax

    assert len(prune) == 1
    assert list(prune)[0].label == 'S_scopolii'

    # ----------------------------
    # Missing sequence and missing tip combined; the paths assigned above are reused.

    data_obj = generate_ATT_from_files(seqaln=seqalnmiss,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefilemiss,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == '2029_doronicum':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"

    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == 'S_scopolii':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"
Example #9
if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi, interactive=True)
ids = IdDicts(conf, workdir=workdir)

otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)

ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)

data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)

data_obj.prune_short()
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")

scraper = PhyscraperScrape(data_obj, ids)

scraper._blasted = 1
scraper.read_blast_wrapper(
    blast_dir="tests/data/precooked/fixed/tte_blast_files")

pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/tiny_acc_map.p",
                                    "wb"))
Example #10
def run_with_settings(settings):
    """Looks for a pickled file to continue a run, or builds and runs a
    new analysis for as long as new seqs are found.
    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(settings.workdir), "rb")
        )
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln, 
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()

        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        ids = IdDicts(conf, workdir=settings.workdir)

        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)

        filteredScrape.write_otu_info(settings.downtorank)

        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False

        # run the analyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape
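The function reads everything it needs off the settings argument. A minimal stand-in built from the attributes accessed above (the real physcraper settings class may carry more fields; every value here is hypothetical):

from types import SimpleNamespace

settings = SimpleNamespace(
    seqaln="tests/data/tiny_test_example/test.fas",
    mattype="fasta",
    trfn="tests/data/tiny_test_example/test.tre",
    schema_trf="newick",
    workdir="tests/output/filter_run",
    configfi="tests/data/test.config",
    spInfoDict="tests/output/filter_run/otu_dict.json",
    threshold=2,
    selectby="blast",
    downtorank=None,
    delay=14,
    add_unpubl_seq=None,
    id_to_spn_addseq_json=None,
    shared_blast_folder=None,
)
filteredScrape = run_with_settings(settings)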
Example #11
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")

    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1   
    else:   
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)

            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)

            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape
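A hedged call sketch for filter_data_run using the tiny test paths; threshold is the per-species cap handed to how_many_sp_to_keep (its exact semantics are an assumption here), and the remaining keyword arguments keep their defaults:

filteredScrape = filter_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                                 mattype="fasta",
                                 trfn="tests/data/tiny_test_example/test.tre",
                                 schema_trf="newick",
                                 workdir="tests/output/filter_run",
                                 threshold=2,
                                 spInfoDict="tests/output/filter_run/otu_dict.json",
                                 configfi="tests/data/test.config")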
Example #12
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of you alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained bu running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """

    debug("Debugging mode is on")

    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
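Following the docstring, a minimal invocation sketch using the example paths from the tests above; the otu_dict JSON is assumed to have been written beforehand with OtuJsonDict as in Example #9:

own_data_run(seqaln="tests/data/tiny_test_example/test.fas",
             mattype="fasta",
             trfn="tests/data/tiny_test_example/test.tre",
             schema_trf="newick",
             workdir="tests/output/own_run",
             sp_info_jsonfi="tests/output/own_run/otu_dict.json",
             configfi="tests/data/test.config")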