def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Concatenate per-strain sequences across ten FASTA alignments.

    Every file is read with ``AlignIO.read(..., 'fasta')``.  The strains are
    taken from ``file1``; for each of its records the strain name is obtained
    with ``util.get_strain_name`` and a record for the same strain is looked
    up in each of the other nine alignments via
    ``util.get_matching_sequence``.  A strain is kept only when all nine
    lookups return a truthy result.

    Args:
        file1: FASTA alignment that drives the iteration.
        file2 ... file10: FASTA alignments searched for the same strain.

    Returns:
        A list of ``[strain_name, concatenated_seq]`` two-element lists, in
        the order the strains appear in ``file1``.  The sequence parts are
        joined in argument order (``file1`` first, ``file10`` last).
    """
    # NOTE(review): this function appears to be defined twice in this file
    # with the same logic; a later module-level definition would shadow this
    # one -- confirm and remove the duplicate.
    paths = (file1, file2, file3, file4, file5, file6, file7, file8, file9, file10)
    alignments = [AlignIO.read(path, 'fasta') for path in paths]
    primary, others = alignments[0], alignments[1:]

    complete_sequences = []
    for first_record in primary:
        strain_name = util.get_strain_name(first_record)
        matches = [
            util.get_matching_sequence(alignment, strain_name=strain_name)
            for alignment in others
        ]
        # Same truthiness test as the original chained ``and``.
        if not all(matches):
            continue

        # Left-to-right concatenation, starting with the file1 sequence.
        combined = first_record.seq
        for match in matches:
            combined = combined + match.seq
        complete_sequences.append([strain_name, combined])
    return complete_sequences
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Build one concatenated sequence per strain from ten FASTA alignments.

    All ten files are loaded with ``AlignIO.read(..., 'fasta')``.  Iteration
    follows the records of ``file1``: the strain name of each record comes
    from ``util.get_strain_name`` and the corresponding record in every other
    alignment is fetched with ``util.get_matching_sequence``.  Strains for
    which any of those nine lookups yields a falsy result are left out.

    Args:
        file1: FASTA alignment whose records define the strains to process.
        file2 ... file10: FASTA alignments queried for each strain.

    Returns:
        A list whose items are ``[strain_name, concatenated_seq]`` lists, in
        ``file1`` order.  Sequence parts are appended in argument order.
    """
    # NOTE(review): this function appears to be defined twice in this file
    # with the same logic; when both sit at module level only the later one
    # is effective -- confirm and remove the duplicate.
    alignments = [
        AlignIO.read(path, 'fasta')
        for path in (file1, file2, file3, file4, file5,
                     file6, file7, file8, file9, file10)
    ]
    reference_alignment = alignments[0]
    other_alignments = alignments[1:]

    complete_sequences = []
    for reference_record in reference_alignment:
        strain_name = util.get_strain_name(reference_record)
        partners = [
            util.get_matching_sequence(alignment, strain_name=strain_name)
            for alignment in other_alignments
        ]
        if all(partners):
            # Join file1's sequence with the partners, in argument order.
            joined = reference_record.seq
            for partner in partners:
                joined = joined + partner.seq
            complete_sequences.append([strain_name, joined])
    return complete_sequences
def combine_2_fastas(file1, file2, output_file="ha_na.fasta"):
    """Join matching records of two FASTA alignments and write them to disk.

    Both inputs are read with ``AlignIO.read(..., 'fasta')``.  For every
    record in ``file1`` the record of the same strain is looked up in
    ``file2`` through ``util.get_matching_sequence``; when the lookup returns
    something other than ``None`` the two sequences are concatenated
    (``file1`` part first) into a new ``SeqRecord``.

    The new records get the ids ``"1"``, ``"2"``, ... in the order they are
    produced and carry ``'HA_NA'`` as both name and description.  The number
    of records is printed before they are written.

    Args:
        file1: FASTA alignment whose records drive the iteration.
        file2: FASTA alignment searched for a record of the same strain.
        output_file: Destination FASTA path.  Defaults to ``"ha_na.fasta"``,
            the path that used to be hard-coded.

    Returns:
        None.
    """
    first_alignment = AlignIO.read(file1, 'fasta')
    second_alignment = AlignIO.read(file2, 'fasta')

    records = []
    for first_record in first_alignment:
        strain_name = util.get_strain_name(first_record)
        second_record = util.get_matching_sequence(second_alignment, strain_name=strain_name)
        if second_record is None:
            continue
        joined = Seq(str(first_record.seq + second_record.seq))
        # Ids are 1-based and count only strains found in both files.
        records.append(
            SeqRecord(joined, id=str(len(records) + 1), name='HA_NA', description='HA_NA'))

    print(len(records))
    # The with-statement closes the handle; no explicit close() is needed.
    with open(output_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")
def update_year_counts(record):
    """File the strain name of ``record`` under its year in ``strains_by_year``.

    The strain name comes from ``util.get_strain_name``; the year key is the
    text after the last ``'/'`` in that name.  ``strains_by_year`` is not
    defined inside this function -- it is resolved from an outer scope and
    mutated in place, mapping each year key to a list of strain names.
    """
    name = util.get_strain_name(record)
    year_key = name.rsplit('/', 1)[-1]
    # Create the per-year list on first sight of the key, then append.
    strains_by_year.setdefault(year_key, []).append(name)
def create_unique_sequences(fasta_file, unique_strains_file):
    """Select alignment records whose strain is listed in a strains file.

    ``fasta_file`` is read with ``AlignIO.read(..., 'fasta')``.  The wanted
    strain names are the first item produced by
    ``util.read_file(unique_strains_file)``.  Each time a record's strain name
    is found in that collection the record is kept and one occurrence of the
    name is removed from it, so later records with the same name are only
    kept if the name is still present.

    Returns:
        The kept records as a list, in alignment order.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    rows = list(util.read_file(unique_strains_file))
    remaining = rows[0]

    kept = []
    for record in alignment:
        name = util.get_strain_name(record)
        if name not in remaining:
            continue
        kept.append(record)
        # Consume the name so it is not matched again.
        remaining.remove(name)
    return kept
def get_strain_names(fasta_file, k, max_year):
    """Pick up to ``k`` HA strain names per year from a FASTA file.

    Records are parsed with ``SeqIO.parse``.  Each description is split on
    ``'|'``; records with fewer than six fields are reported and skipped, and
    the rest are considered only when the sixth field contains ``'HA'``.  The
    strain name of a considered record (``gau.get_strain_name``) is grouped
    under the text after its last ``'/'``, which is used as the year key.

    The per-year counts are printed.  Then, for every year key whose integer
    value is below ``max_year``, the strains are added to the result: all of
    them when there are at most ``k``, otherwise ``k`` of them chosen with
    ``random.sample``.  Year keys that cannot be parsed as an integer are
    treated as year 0.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.
        k: Maximum number of strain names taken from one year.
        max_year: Exclusive upper bound on the years that are included.

    Returns:
        A list of the selected strain names.
    """
    # NOTE(review): this function appears to be defined twice in this file
    # with the same logic; a later module-level definition would shadow this
    # one -- confirm and remove the duplicate.
    strains_by_year = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        description_list = record.description.split('|')
        # Index 5 is read below, so at least six fields are required
        # (the former ``< 4`` check let 4- and 5-field records through to an
        # IndexError).
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue
        if 'HA' not in description_list[5]:
            continue
        strain_name = gau.get_strain_name(record)
        year = strain_name.split('/')[-1]
        strains_by_year.setdefault(year, []).append(strain_name)

    # what are the counts per year?
    for key, value in strains_by_year.items():
        print(key, len(value))

    strain_names = []
    for key, value in strains_by_year.items():
        try:
            year = int(key)
        except ValueError:
            # Non-numeric year key: counts as year 0.
            year = 0
        if year < max_year:
            print(key + ' is less than ' + str(max_year))
            if len(value) > k:
                strain_names.extend(random.sample(value, k=k))
            else:
                strain_names.extend(value)
        else:
            # Also reached when year == max_year; message kept as is.
            print(key + ' is greater than ' + str(max_year))
    return strain_names
def get_strain_names(fasta_file, k, max_year):
    """Collect HA strain names from a FASTA file, capped at ``k`` per year.

    The file is parsed with ``SeqIO.parse``.  A record's description is split
    on ``'|'``.  Records with fewer than six fields are printed as too short
    and skipped; of the remaining ones only those whose sixth field contains
    ``'HA'`` are used.  Their strain names (``gau.get_strain_name``) are
    grouped by the part after the last ``'/'``, taken as the year key.

    After printing the number of strains per year key, every key whose
    integer value is less than ``max_year`` contributes its strains: all of
    them if there are no more than ``k``, else a ``random.sample`` of size
    ``k``.  A key that is not an integer is handled as year 0.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.
        k: Per-year cap on the number of returned strain names.
        max_year: Years greater than or equal to this value are excluded.

    Returns:
        A list of the selected strain names.
    """
    # NOTE(review): this function appears to be defined twice in this file
    # with the same logic; when both sit at module level only the later one
    # is effective -- confirm and remove the duplicate.
    strains_by_year = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        description_list = record.description.split('|')
        # description_list[5] is accessed next, so six fields are the real
        # minimum; the previous ``< 4`` guard allowed an IndexError.
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue
        if 'HA' in description_list[5]:
            strain_name = gau.get_strain_name(record)
            year = strain_name.split('/')[-1]
            strains_by_year.setdefault(year, []).append(strain_name)

    # what are the counts per year?
    for key, value in strains_by_year.items():
        print(key, len(value))

    strain_names = []
    for key, value in strains_by_year.items():
        try:
            year = int(key)
        except ValueError:
            # Unparsable year key is treated as year 0.
            year = 0
        if year < max_year:
            print(key + ' is less than ' + str(max_year))
            chosen = random.sample(value, k=k) if len(value) > k else value
            strain_names.extend(chosen)
        else:
            # This branch also covers year == max_year; message unchanged.
            print(key + ' is greater than ' + str(max_year))
    return strain_names
def custom_split():
    """Split the FASTA file named in ``sys.argv[1]`` into per-protein files.

    The strain names to keep come from
    ``get_strain_names(fasta_file, k=100, max_year=2020)``.  The file is then
    parsed with ``SeqIO.parse``; each description is split on ``'|'`` and
    records with fewer than six fields are reported and skipped.  A record is
    assigned to a protein when that protein's tag is a substring of the sixth
    field (for PB1 the fifth field is checked as well) and its strain name
    (``gau.get_strain_name``) is among the kept strains.  The tag checks are
    independent of each other, so a record may be assigned to more than one
    protein.

    For each protein the record count is printed and the records are written
    to ``<label>.fasta`` in the current directory, in this order: ha, na, m1,
    m2, np, pb1, pb2, ns1, ns2, pa.

    Returns:
        None.
    """
    fasta_file = sys.argv[1]
    # Only membership is tested, so a set gives constant-time lookups.
    selected_strains = set(get_strain_names(fasta_file, k=100, max_year=2020))

    # (output label, tag to search for, description field indexes to search),
    # listed in the order the output files are written.
    protein_rules = (
        ('ha', 'HA', (5,)),
        ('na', 'NA', (5,)),
        ('m1', 'M1', (5,)),
        ('m2', 'M2', (5,)),
        ('np', 'NP', (5,)),
        ('pb1', 'PB1 Polymerase (basic) protein 1', (5, 4)),
        ('pb2', 'PB2', (5,)),
        ('ns1', 'NS1 Non-structural protein 1', (5,)),
        ('ns2', 'NS2 Non-structural protein 2', (5,)),
        ('pa', 'PA Polymerase (acidic) protein', (5,)),
    )
    sequences_by_label = {}
    for label, _tag, _positions in protein_rules:
        sequences_by_label[label] = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        description_list = record.description.split('|')
        # Field index 5 is read below, so six fields are the real minimum
        # (the former ``< 4`` check allowed an IndexError).
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue

        matched_labels = [
            label
            for label, tag, positions in protein_rules
            if any(tag in description_list[index] for index in positions)
        ]
        # The strain name is only needed when at least one tag matched.
        if not matched_labels:
            continue
        if gau.get_strain_name(record) not in selected_strains:
            continue
        for label in matched_labels:
            sequences_by_label[label].append(record)

    for label, _tag, _positions in protein_rules:
        label_records = sequences_by_label[label]
        # ``with`` guarantees the file is closed even if writing fails.
        with open(label + ".fasta", "w") as output_handle:
            print("length of " + label + " sequences ", len(label_records))
            SeqIO.write(label_records, output_handle, "fasta")