Exemplo n.º 1
0
        # Abort: the source FASTA is required for everything that follows.
        # The species name is concatenated into a single string so the quotes
        # wrap it directly; passing it as a separate print() argument put the
        # separator space inside the quotes ("for' human'.").
        print("Error: imgt-data.fasta file not detected for '" + species +
              "'. Please generate and place it in the appropriate Data subdirectory.")
        # Non-zero status so that calling shells/pipelines can see the failure
        sys.exit(1)

    # If so, check the modification time for the imgt-data.fasta file, assuming that's the last download time
    input_imgt_file = species_dir + 'imgt-data.fasta'
    mod_date = datetime.fromtimestamp(
        os.path.getmtime(input_imgt_file)).strftime('%Y-%m-%d')

    # Then read through the FASTA and sort into the appropriate chains.
    # Plain 'r' is used: the 'U' flag is deprecated in Python 3 (universal
    # newlines are already the text-mode default) and open() rejects it
    # outright from Python 3.11 onwards.
    with open(input_imgt_file, 'r') as in_file, \
            open(species_dir + 'TRA.fasta', 'w') as TRA, \
            open(species_dir + 'TRB.fasta', 'w') as TRB:

        for fasta_id, seq, blank in fxn.read_fa(in_file):
            # The second '|'-delimited header field is split on '*' into
            # exactly two parts; only the gene part decides the output file.
            # The two-name unpacking raises if there is not exactly one '*'.
            gene, allele = fasta_id.split('|')[1].split('*')

            # NB: TRDV included with TRA genes due to the evidence that even non 'TRAV/DV' genes can recombine with TRAJ
            if 'TRA' in gene or 'TRDV' in gene:
                TRA.write(fxn.fastafy(fasta_id, seq))
            elif 'TRB' in gene:
                TRB.write(fxn.fastafy(fasta_id, seq))
            # Entries matching neither pattern are not written to either file

    # Finally record two dates in the species directory: when the source
    # FASTA was last modified, and when this splitting script was run
    log_txt = ''.join([
        'imgt-data.fasta_last_modified ',
        mod_date,
        '\nsplit-imgt-data.py_last_run ',
        fxn.today(),
    ])
    with open(species_dir + 'data-production-date.txt', 'w') as date_log:
        date_log.write(log_txt)
    # Run the project's scripts-directory check, then set global plot styling
    fxn.check_scripts_dir()
    sns.set(font_scale=1.5, font="Arial")

    # Sort directories, get data: plots go in a date-stamped folder, which is
    # created only if nothing exists at that path yet
    plot_dir = ''.join(
        (fxn.plot_dir, fxn.get_date(), '-mouse-proteome-check/'))
    plot_dir_missing = not os.path.exists(plot_dir)
    if plot_dir_missing:
        os.mkdir(plot_dir)

    # Read proteome into dict: take the first entry returned by os.listdir
    # whose name contains '_mouse.fasta' (raises IndexError if none is present)
    mouse_proteome_file = [
        x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x
    ][0]

    # No default factory is supplied, so missing keys still raise KeyError
    mouse_proteins = coll.defaultdict()
    # Open in text mode ('rt'): gzip rejects the 'U' flag under Python 3, and
    # the header below is split on a str, so decoded text is required
    with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rt') as in_file:
        for protein, seq, blank in fxn.read_fa(in_file):
            # Key each sequence by the header text before the first space
            mouse_proteins[protein.split(' ')[0]] = seq

    # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file
    data_dir = '../Data/NonPredictedBinders/'
    matches = coll.defaultdict(fxn.nest_counter)
    all_peptides = coll.defaultdict(list)
    for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]:
        nam = f.split('-')[0]
        search_builder = AcoraBuilder()
        peptides = []

        # Build trie
        with open(data_dir + f, 'rU') as in_file:
            for line in in_file:
                search_builder.add(line.rstrip())