Exemplo n.º 1
0
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Concatenate, strain by strain, the sequences of ten FASTA alignments.

    Every file is loaded with ``AlignIO.read(..., 'fasta')``.  For each record
    of the first alignment, ``util.get_matching_sequence`` is asked for the
    record with the same strain name in each of the other nine alignments.
    A strain is kept only when all nine lookups return a truthy result.

    Returns:
        A list of ``[strain_name, concatenated_seq]`` two-item lists, ordered
        like the records of ``file1``; each concatenation follows file order
        (file1 first, file10 last).
    """
    first_alignment, *other_alignments = [
        AlignIO.read(path, 'fasta')
        for path in (file1, file2, file3, file4, file5,
                     file6, file7, file8, file9, file10)
    ]

    complete_sequences = []

    for primary in first_alignment:
        strain_name = util.get_strain_name(primary)

        # All nine lookups are performed before completeness is judged.
        partners = [
            util.get_matching_sequence(alignment, strain_name=strain_name)
            for alignment in other_alignments
        ]
        if not all(partners):
            continue

        entry = [util.get_strain_name(primary)]
        joined = primary.seq
        for partner in partners:
            joined = joined + partner.seq
        entry.append(joined)
        complete_sequences.append(entry)

    return complete_sequences
Exemplo n.º 2
0
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8,
                     file9, file10):
    """Build one concatenated sequence per strain from ten FASTA alignments.

    The ten files are read in argument order with ``AlignIO.read``.  The
    records of the first alignment drive the loop: for each one, the record
    with the same strain name is requested from the nine other alignments
    through ``util.get_matching_sequence``.  If any of those lookups gives a
    falsy result, the strain is left out.

    Returns:
        A list whose items are ``[strain_name, sequence]`` lists, where
        ``sequence`` is the ``.seq`` of the ten matching records added
        together in file order.
    """
    alignments = []
    for fasta_path in (file1, file2, file3, file4, file5,
                       file6, file7, file8, file9, file10):
        alignments.append(AlignIO.read(fasta_path, 'fasta'))

    anchor_alignment = alignments[0]
    remaining_alignments = alignments[1:]

    complete_sequences = []

    for anchor in anchor_alignment:
        strain_name = util.get_strain_name(anchor)

        matches = []
        for alignment in remaining_alignments:
            matches.append(
                util.get_matching_sequence(alignment,
                                           strain_name=strain_name))

        # Skip strains that are missing from at least one alignment.
        if any(not match for match in matches):
            continue

        label = util.get_strain_name(anchor)
        merged = anchor.seq
        for match in matches:
            merged = merged + match.seq
        complete_sequences.append([label, merged])

    return complete_sequences
Exemplo n.º 3
0
def combine_2_fastas(file1, file2, output_file="ha_na.fasta"):
    """Join matching records of two FASTA alignments and write them to a file.

    Both inputs are read with ``AlignIO.read(..., 'fasta')``.  For every
    record of ``file1`` the record with the same strain name is looked up in
    ``file2`` via ``util.get_matching_sequence``; when the lookup returns
    ``None`` the strain is skipped.  Each matched pair becomes one
    ``SeqRecord`` holding the two sequences back to back, with a running
    1-based number as its id and ``'HA_NA'`` as name and description.

    The number of combined records is printed, then the records are written
    in FASTA format.

    Args:
        file1: FASTA alignment whose records drive the pairing.
        file2: FASTA alignment searched for the matching strain.
        output_file: Destination path; defaults to ``"ha_na.fasta"``, the
            path that used to be hard-coded.
    """
    sequences1 = AlignIO.read(file1, 'fasta')
    sequences2 = AlignIO.read(file2, 'fasta')

    records = []

    for sequence1 in sequences1:
        strain_name = util.get_strain_name(sequence1)
        sequence2 = util.get_matching_sequence(sequences2,
                                               strain_name=strain_name)
        if sequence2 is None:
            continue

        records.append(
            SeqRecord(Seq(str(sequence1.seq + sequence2.seq)),
                      # ids are consecutive, starting at 1
                      id=str(len(records) + 1),
                      name='HA_NA',
                      description='HA_NA'))

    print(len(records))
    # The context manager closes the file; no explicit close() is needed.
    with open(output_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")
Exemplo n.º 4
0
def update_year_counts(record):
    """File the strain name of *record* under its year in ``strains_by_year``.

    The year key is the text after the last ``'/'`` of the strain name
    returned by ``util.get_strain_name``.  ``strains_by_year`` is a mapping
    defined outside this function; it maps each year key to the list of
    strain names seen for it (names are stored, not counts).
    """
    strain_name = util.get_strain_name(record)
    year = strain_name.rsplit('/', 1)[-1]
    # Create the per-year list on first sight, then append.
    strains_by_year.setdefault(year, []).append(strain_name)
Exemplo n.º 5
0
def create_unique_sequences(fasta_file, unique_strains_file):
    """Select the alignment records whose strain name is still wanted.

    The alignment is read from ``fasta_file`` with ``AlignIO.read``.  The
    wanted names are the first item produced by ``util.read_file`` for
    ``unique_strains_file`` (presumably a list of strain names -- it must
    support ``in`` and ``remove``).  Each time a record is kept, its name is
    removed from the wanted names, so a name is matched no more often than it
    occurs there.

    Returns:
        A list of the kept records, in alignment order.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    remaining = list(util.read_file(unique_strains_file))[0]
    kept = []

    for record in alignment:
        name = util.get_strain_name(record)
        if name not in remaining:
            continue
        kept.append(record)
        remaining.remove(name)

    return kept
Exemplo n.º 6
0
def create_unique_sequences(fasta_file, unique_strains_file):
    """Return the records of a FASTA alignment that match a strain list.

    ``fasta_file`` is loaded through ``AlignIO.read``; the strain list is the
    first element of what ``util.read_file(unique_strains_file)`` yields
    (assumed to be a mutable list of names -- TODO confirm against
    ``util.read_file``).  A matched name is deleted from the list right away,
    which is what prevents a later record with the same name from being
    selected again.

    Returns:
        The selected records as a list, in the order they were read.
    """
    records = AlignIO.read(fasta_file, 'fasta')
    wanted_names = list(util.read_file(unique_strains_file))[0]

    selected = []
    for candidate in records:
        candidate_name = util.get_strain_name(candidate)
        is_wanted = candidate_name in wanted_names
        if is_wanted:
            selected.append(candidate)
            # consume the name so it cannot match a second record
            wanted_names.remove(candidate_name)

    return selected
Exemplo n.º 7
0
def get_strain_names(fasta_file, k, max_year):
    """Pick up to *k* HA strain names per year for years below *max_year*.

    Records are streamed from ``fasta_file`` with ``SeqIO.parse``.  The
    record description is split on ``'|'``; records whose sixth field
    (index 5) contains ``'HA'`` are grouped by year, the year being the text
    after the last ``'/'`` of ``gau.get_strain_name(record)``.

    The per-year counts are printed.  Then, for each year that is below
    ``max_year`` (a year that cannot be parsed as an integer counts as 0), at
    most ``k`` names are taken: a random sample of ``k`` when the year has
    more than ``k`` names, otherwise all of them.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.
        k: Maximum number of strain names kept per year.
        max_year: Exclusive upper bound on the year.

    Returns:
        A list of the selected strain names.
    """
    strains_by_year = {}

    for record in SeqIO.parse(fasta_file, "fasta"):

        description_list = record.description.split('|')

        # Index 5 is read below, so six fields are the real minimum; the
        # previous "< 4" guard let 4- and 5-field descriptions through and
        # they crashed with IndexError.
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue

        if 'HA' not in description_list[5]:
            continue

        strain_name = gau.get_strain_name(record)
        year = strain_name.split('/')[-1]
        strains_by_year.setdefault(year, []).append(strain_name)

    # what are the counts per year?
    for key, value in strains_by_year.items():
        print(key, len(value))

    strain_names = []
    for key, value in strains_by_year.items():
        try:
            year = int(key)
        except ValueError:
            # Non-numeric year text is treated as year 0.
            year = 0
        if year < max_year:
            print(key + ' is less than ' + str(max_year))
            if len(value) > k:
                strain_names.extend(random.sample(value, k=k))
            else:
                strain_names.extend(value)
        else:
            # NOTE: this branch also covers year == max_year.
            print(key + ' is greater than ' + str(max_year))

    return strain_names
Exemplo n.º 8
0
def get_strain_names(fasta_file, k, max_year):
    """Collect a per-year capped selection of HA strain names.

    Each record parsed from ``fasta_file`` (FASTA, via ``SeqIO.parse``) has
    its description split on ``'|'``.  When the field at index 5 contains
    ``'HA'``, the strain name from ``gau.get_strain_name`` is stored under
    its year, taken as the part of the name after the final ``'/'``.

    After printing how many names each year has, the function walks the
    years again: a year below ``max_year`` contributes all its names, or a
    random sample of ``k`` of them when it has more than ``k``.  Year text
    that is not an integer is handled as 0.

    Args:
        fasta_file: Input understood by ``SeqIO.parse``.
        k: Cap on the number of names taken from a single year.
        max_year: Years greater than or equal to this are left out.

    Returns:
        The selected strain names as a list.
    """
    strain_names = []
    strains_by_year = {}

    for record in SeqIO.parse(fasta_file, "fasta"):

        description_list = record.description.split('|')

        # The code below indexes field 5, so fewer than six fields is too
        # short (the old "< 4" test allowed an IndexError for 4 or 5 fields).
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue

        if 'HA' in description_list[5]:
            strain_name = gau.get_strain_name(record)
            year = strain_name.split('/')[-1]
            strains_by_year.setdefault(year, []).append(strain_name)

    # what are the counts per year?
    for key, value in strains_by_year.items():
        print(key, len(value))

    for key, value in strains_by_year.items():
        try:
            year = int(key)
        except ValueError:
            # int() on non-numeric text: fall back to year 0
            year = 0

        if year < max_year:
            print(key + ' is less than ' + str(max_year))
            chosen = random.sample(value, k=k) if len(value) > k else value
            strain_names.extend(chosen)
        else:
            # NOTE: reached for year == max_year as well.
            print(key + ' is greater than ' + str(max_year))

    return strain_names
Exemplo n.º 9
0
def custom_split():
    """Split the FASTA file named in ``sys.argv[1]`` into per-protein files.

    The strain names to keep come from
    ``get_strain_names(fasta_file, k=100, max_year=2020)``.  Every record of
    the input is then tested against each protein marker below; a record is
    added to a protein's list when the marker text occurs in field 5 of the
    ``'|'``-separated description (for PB1, field 4 is checked as well) and
    the record's strain name is among the kept names.  The tests are
    independent, so one record may land in several lists.

    For each protein, ``<label>.fasta`` is written to the current directory
    and the number of records is printed.
    """
    fasta_file = sys.argv[1]
    # A set gives constant-time membership tests in the per-record loop.
    strain_names = set(get_strain_names(fasta_file, k=100, max_year=2020))

    # (output label, marker text, also search description field 4)
    # Listed in the order the output files are written.
    protein_markers = (
        ("ha", 'HA', False),
        ("na", 'NA', False),
        ("m1", 'M1', False),
        ("m2", 'M2', False),
        ("np", 'NP', False),
        ("pb1", 'PB1 Polymerase (basic) protein 1', True),
        ("pb2", 'PB2', False),
        ("ns1", 'NS1 Non-structural protein 1', False),
        ("ns2", 'NS2 Non-structural protein 2', False),
        ("pa", 'PA Polymerase (acidic) protein', False),
    )
    sequences_by_label = {label: [] for label, _, _ in protein_markers}

    for record in SeqIO.parse(fasta_file, "fasta"):

        description_list = record.description.split('|')

        # Fields 4 and 5 are read below, so six fields are required; the
        # previous "< 4" guard let shorter descriptions raise IndexError.
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue

        matched_labels = [
            label
            for label, marker, check_field_4 in protein_markers
            if marker in description_list[5]
            or (check_field_4 and marker in description_list[4])
        ]
        if not matched_labels:
            continue

        # The strain name is only needed once per record.
        if gau.get_strain_name(record) in strain_names:
            for label in matched_labels:
                sequences_by_label[label].append(record)

    for label, _, _ in protein_markers:
        records = sequences_by_label[label]
        # "with" closes the file even if writing fails.
        with open(label + ".fasta", "w") as output_handle:
            print("length of " + label + " sequences ", len(records))
            SeqIO.write(records, output_handle, "fasta")
Exemplo n.º 10
0
def custom_split():
    """Write one FASTA file per protein from the input given on the command line.

    ``sys.argv[1]`` names the input FASTA file.  The strains to keep are
    chosen by ``get_strain_names(fasta_file, k=100, max_year=2020)``.  Each
    record's description is split on ``'|'``; a record belongs to a protein
    when that protein's marker text is found in field 5 (PB1 also accepts a
    hit in field 4).  Records of kept strains are collected per protein; the
    checks do not exclude each other, so a record can be collected for more
    than one protein.

    Output: ``ha.fasta``, ``na.fasta``, ``m1.fasta``, ``m2.fasta``,
    ``np.fasta``, ``pb1.fasta``, ``pb2.fasta``, ``ns1.fasta``, ``ns2.fasta``
    and ``pa.fasta`` in the current directory, plus one printed count each.
    """
    fasta_file = sys.argv[1]
    # Membership is tested for every matching record; use a set for that.
    kept_strains = set(get_strain_names(fasta_file, k=100, max_year=2020))

    # label -> (marker text, description field indexes that are searched)
    # Insertion order is the order in which the files are written.
    markers = {
        "ha": ('HA', (5,)),
        "na": ('NA', (5,)),
        "m1": ('M1', (5,)),
        "m2": ('M2', (5,)),
        "np": ('NP', (5,)),
        "pb1": ('PB1 Polymerase (basic) protein 1', (5, 4)),
        "pb2": ('PB2', (5,)),
        "ns1": ('NS1 Non-structural protein 1', (5,)),
        "ns2": ('NS2 Non-structural protein 2', (5,)),
        "pa": ('PA Polymerase (acidic) protein', (5,)),
    }
    collected = {label: [] for label in markers}

    for record in SeqIO.parse(fasta_file, "fasta"):

        description_list = record.description.split('|')

        # Index 5 (and 4) is accessed below: anything under six fields is
        # too short.  The old "< 4" check allowed an IndexError here.
        if len(description_list) < 6:
            print("protein description too short ", record.description)
            continue

        hits = [
            label
            for label, (marker, field_indexes) in markers.items()
            if any(marker in description_list[i] for i in field_indexes)
        ]
        if not hits:
            continue

        strain_name = gau.get_strain_name(record)
        if strain_name not in kept_strains:
            continue

        for label in hits:
            collected[label].append(record)

    for label, records in collected.items():
        # The context manager guarantees the handle is closed on error too.
        with open(label + ".fasta", "w") as output_handle:
            print("length of " + label + " sequences ", len(records))
            SeqIO.write(records, output_handle, "fasta")