def create_protein_alignment(protein_id, species):
    '''
    Writes three protein sequences to a FASTA file and aligns them by running
    the command produced by AlignmentCommandGenerator.generate_mafft_command.
    The sequences are written in this order: reference species protein,
    the assembled protein and the ensembl species protein.
    The input is written to <mafft path>/<species>.fa and the alignment
    output path handed to the command is <mafft path>/<species>.afa.
    @param protein_id: referent protein id
    @param species: species (latin)
    '''
    dc                  = DirectoryCrawler()
    pc                  = ProteinContainer.Instance()
    dmc                 = DataMapContainer.Instance()
    acg                 = AlignmentCommandGenerator()
    tpc                 = TranslatedProteinContainer.Instance()

    # the data map links the (reference protein, species) pair to the
    # id of the species' own protein
    data_map            = dmc.get((protein_id, species))

    # get all the proteins
    ref_protein         = pc.get(protein_id)
    species_protein     = pc.get(data_map.protein_id)
    assembled_protein   = tpc.get(protein_id, species)

    sequences_for_fasta = [
        ref_protein.get_sequence_record(),
        assembled_protein.get_sequence_record(),
        species_protein.get_sequence_record(),
    ]

    # look the output directory up once and derive both file names from it
    mafft_dir       = dc.get_mafft_path(protein_id)
    msa_fasta       = "%s/%s.fa" % (mafft_dir, species)
    msa_afa         = "%s/%s.afa" % (mafft_dir, species)

    # the context manager closes the file even if SeqIO.write raises
    with open(msa_fasta, "w") as msa_fasta_file:
        SeqIO.write(sequences_for_fasta, msa_fasta_file, "fasta")

    mafft_cmd = acg.generate_mafft_command(msa_fasta, msa_afa)
    os.system(mafft_cmd)
def create_msa_alignments ():
    '''
    Builds two multiple sequence alignments for every protein returned by
    get_protein_list() that passes check_status_file().
    msa_exoloc: the reference protein record plus one record for each FASTA
    file in the assembled protein directory (Homo_sapiens.fa is skipped).
    msa_ensembl: the reference protein record plus, for every species other
    than Homo_sapiens, the protein record found through the data map, with
    its id set to the species name.
    Both record sets are written as .fa files into the mafft path of the
    protein and aligned with the generated mafft command (output .afa).
    '''
    crawler = DirectoryCrawler()
    proteins = ProteinContainer.Instance()
    data_maps = DataMapContainer.Instance()
    cmd_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = proteins.get(prot_id).get_sequence_record()

        # records loaded from the assembled protein FASTA files
        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        exoloc_proteins = [reference_record] + [
            load_fasta_single_record("%s/%s" % (assembled_dir, fasta_name), IUPAC.protein)
            for fasta_name in sorted(os.listdir(assembled_dir))
            if fasta_name != "Homo_sapiens.fa"
        ]

        # records taken from the protein container, relabelled by species
        ensembl_proteins = [reference_record]
        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                species_record = proteins.get(data_maps.get((prot_id, species)).protein_id).get_sequence_record()
                species_record.id = species
                ensembl_proteins.append(species_record)

        msa_exoloc_path = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        msa_ensembl_path = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)

        # both inputs are written before either alignment is started
        write_seq_records_to_file(msa_exoloc_path, exoloc_proteins)
        write_seq_records_to_file(msa_ensembl_path, ensembl_proteins)

        alignment_jobs = (
            (msa_exoloc_path, "%s/msa_exoloc.afa"),
            (msa_ensembl_path, "%s/msa_ensembl.afa"),
        )
        for (fasta_path, afa_template) in alignment_jobs:
            cmd = cmd_generator.generate_mafft_command(fasta_path, afa_template % crawler.get_mafft_path(prot_id))
            print(cmd)
            os.system(cmd)
def create_species_msa_alignments ():
    '''
    Builds one alignment per (protein, species) pair for every protein
    returned by get_protein_list() that passes check_status_file().
    Each alignment holds the reference protein record (id set to
    "Homo_sapiens"), the species protein record found through the data map
    (id set to the species name) and, when <assembled path>/<species>.fa
    exists, the record loaded from that file.
    The records are written to <mafft path>/<species>.fa, aligned with the
    generated mafft command into <mafft path>/<species>.afa, and the
    temporary .fa input is removed afterwards.
    Homo_sapiens itself is skipped.
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"

        assembled_dir = dc.get_assembled_protein_path(prot_id)
        species_list = get_species_list(prot_id, None)

        for species in species_list:

            # skip the reference species before doing any work for it
            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            prot_rec = pc.get(data_map.protein_id).get_sequence_record()
            prot_rec.id = species

            # always at least two records here, so no "nothing to align" check is needed
            protein_recs = [ref_prot_rec, prot_rec]

            # test for the one file we need instead of listing the whole
            # directory again for every species
            exoloc_fasta = "%s/%s.fa" % (assembled_dir, species)
            if os.path.isfile(exoloc_fasta):
                protein_recs.append(load_fasta_single_record(exoloc_fasta, IUPAC.protein))

            mafft_dir = dc.get_mafft_path(prot_id)
            msa_species_path = "%s/%s.fa" % (mafft_dir, species)
            write_seq_records_to_file(msa_species_path, protein_recs)

            cmd = acg.generate_mafft_command(msa_species_path, "%s/%s.afa" % (mafft_dir, species))
            print(cmd)
            os.system(cmd)

            # the .fa file is only the alignment input, drop it once mafft has run
            os.remove(msa_species_path)
def main():
    '''
    Runs the mutual best search for every protein of the reference species.
    For each protein from get_protein_list() the directory tree is generated.
    If a non-empty mutual best status file already exists, the protein is
    skipped: a MUTUAL_BEST status of OK is logged as already done, any other
    MUTUAL_BEST value is logged as failed and the protein is added to the
    failed list.
    Otherwise the description file is (re)created, the reference protein is
    extracted with the fastacmd command, find_ortholog_by_RBH is called for
    every species from get_default_species_list(), and the MUTUAL_BEST entry
    in the status file is set to OK or FAILED depending on what the
    description file then contains.
    The ids of all failed proteins are printed at the end.
    '''

    reference_species = "Homo_sapiens"

    dc = DirectoryCrawler()
    acg = AlignmentCommandGenerator()

    logger = Logger.Instance()
    mutual_best_logger = logger.get_logger('mutual_best')

    protein_list = get_protein_list()
    species_list = get_default_species_list()
    failed_proteins = []

    for (protein_id, num_of_exons) in protein_list:

        print(protein_id)

        # generate all the directories for the protein
        dc.generate_directory_tree(protein_id)

        descr_file_path = dc.get_protein_description_file_path(protein_id)
        status_file_path = dc.get_mutual_best_status_file_path(protein_id)

        # a non-empty status file means this protein was processed before
        if os.path.isfile(status_file_path) and os.path.getsize(status_file_path):
            print(DescriptionParser().get_protein_ids(protein_id))

            status_dict = read_status_file(protein_id)
            if 'MUTUAL_BEST' in status_dict:
                if status_dict['MUTUAL_BEST'] == 'OK':
                    mutual_best_logger.info('-,%s,mutual_best already exists for this protein - moving to the next one' % protein_id)
                else :
                    mutual_best_logger.error('-,%s,mutual_best has failed for this protein (no orthologs found) - moving on the next one' % protein_id)
                    failed_proteins.append(protein_id)
            continue

        # create the description file; try/finally makes sure it is closed
        # even if one of the external commands or the ortholog search raises
        descr_file = open(descr_file_path, 'w')
        try:
            # reference protein file
            ref_species_pep =  dc.get_protein_path(protein_id) + "/" + reference_species + ".fasta"
            fastacmd = acg.generate_fastacmd_protein_command(protein_id, reference_species, "all", ref_species_pep)

            # communicate() collects the merged stdout/stderr, closes the
            # pipes and waits for the child, so no process is left unreaped
            p = Popen(fastacmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            output = p.communicate()[0]
            # any output from the command is treated as an error
            if output:
                mutual_best_logger.error("%s,fastacmd error" % protein_id)

            # find orthologues for all species
            for species in species_list:
                find_ortholog_by_RBH (reference_species, species, ref_species_pep, protein_id, descr_file, mutual_best_logger)
        finally:
            descr_file.close()

        mutual_best_logger.info("\n\n")

        # check what we've found out, whether this protein has any orthologs
        (known_dict, abinitio_dict) = DescriptionParser().get_protein_ids(protein_id)
        # the search failed when there is no abinitio entry and the known
        # entries are either empty or consist of the reference species only
        only_reference_known = list(known_dict.keys()) == [reference_species]
        if not abinitio_dict and (not known_dict or only_reference_known):
            mutual_best_logger.info ("-,%s, mutual best failed for this protein." % protein_id)
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "FAILED")
            failed_proteins.append(protein_id)

        else:
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "OK")

    print("Failed proteins: ")
    for failed_protein_id in failed_proteins:
        print(failed_protein_id)
def _read_first_blast_record(xml_path):
    '''
    Parses a BLAST XML report and returns its first record. The file is
    closed before returning.
    @param xml_path: path to the BLAST XML output file
    @return: the first record yielded by NCBIXML.parse
    @raise ValueError: when the parser rejects the report (callers log this
                       as an empty XML file) or when it yields no record
    '''
    with open(xml_path) as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            return blast_record
    raise ValueError("no BLAST records found in %s" % xml_path)


def _search_for_ortholog_in_database(original_species, target_species, original_protein_fasta, original_protein_id, db_type, logger):
    '''
    Takes the reference species protein and makes a BLASTp query on the target_species database.
    Provided that there has been at least one hit, the best BLAST hit is queried against the
    reference species protein database. If the best hit from the second query is the original protein,
    then the protein data is returned. The queries are referred to as the forward and the backward
    hit respectively.
    The temporary files tmp.xml and species.fasta (created in the current
    working directory) are removed on every exit path.
    @param original_species: reference species, searched by the backward query
    @param target_species: species whose database the forward query is run on
    @param original_protein_fasta: FASTA file with the reference protein
    @param original_protein_id: id of the reference protein
    @param db_type: all / abinitio
    @param logger: logger used for command and error logging
    @return: tuple of the fields parsed from the forward hit title
             (protein_id, protein_type, location_type, assembly, location_id,
             seq_start, seq_end, strand, [gene_id - "all" only,] transcript_id),
             or None when no mutual best hit was found
    '''
    acg = AlignmentCommandGenerator()

    if (db_type == "all"):
        protein_id_pattern = re.compile(r"lcl\|(.*)\spep:(.*)\s(.*):(.*):(.*):(.*):(.*):(.*)\sgene:(.*)\stranscript:(.*)\s.*\s.*")
    else:
        protein_id_pattern = re.compile(r"lcl\|(.*)\spep:(.*)\s(.*):(.*):(.*):(.*):(.*):(.*)\stranscript:(.*)\s.*")

    output_file = "tmp.xml"
    fasta_input_file = "species.fasta"
    log_key = (target_species, original_protein_id)

    try:
        # forward hit: reference protein against the target species database
        forward_blastp_cmd = acg.generate_blastp_command_for_species(target_species, original_protein_fasta, output_file, db_type)
        print(forward_blastp_cmd)

        execute_command_and_log(logger, forward_blastp_cmd, log_key)

        try:
            best_forward_hit = _read_first_blast_record(output_file)
        except (ValueError):
            logger.error("%s,%s,XML file empty - no forward blast results" % log_key)
            return None

        if not best_forward_hit.alignments:
            return None

        bfh_title = _get_best_alignment(original_protein_id, best_forward_hit)
        protein_match = protein_id_pattern.match(bfh_title)
        if protein_match is None:
            # unexpected title layout: report it instead of crashing on .groups()
            logger.error("%s,%s,cannot parse forward hit title: %s" % (target_species, original_protein_id, bfh_title))
            return None

        # the groups are exactly the fields returned to the caller
        forward_hit_fields = protein_match.groups()
        protein_id = forward_hit_fields[0]

        # extract the forward hit protein so it can be used as the backward query
        fastacmd = acg.generate_fastacmd_protein_command(protein_id, target_species, db_type, fasta_input_file)

        execute_command_and_log(logger, fastacmd, log_key)

        # backward hit: always searched in the "all" database of the reference species
        backward_blastp_cmd = acg.generate_blastp_command_for_species(original_species, fasta_input_file, output_file, "all")
        print(backward_blastp_cmd)

        execute_command_and_log(logger, backward_blastp_cmd, log_key)

        try:
            best_backward_hit = _read_first_blast_record(output_file)
        except (ValueError):
            logger.error("%s,%s,XML file empty - no backward blast results" % log_key)
            return None

        if not best_backward_hit.alignments:
            return None

        bbh_title = _get_best_alignment(original_protein_id, best_backward_hit)
        # NOTE(review): the backward title comes from the "all" database but is
        # parsed with the db_type specific pattern; only the first group (the
        # protein id) is used - confirm this is intended for abinitio
        protein_match_b = protein_id_pattern.match(bbh_title)
        if protein_match_b is None:
            logger.error("%s,%s,cannot parse backward hit title: %s" % (target_species, original_protein_id, bbh_title))
            return None

        # mutual best hit only if the backward search leads back to the original protein
        if original_protein_id == protein_match_b.group(1):
            return forward_hit_fields
        return None
    finally:
        # never leave a stale report or query behind for the next call to pick up
        for tmp_path in (output_file, fasta_input_file):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        else:
            break
        i += 1
    
    pattern = re.compile("lcl\|(.*)\spep::*")
    for title in best_alignments:
        prot_match = re.match(pattern, title)
        if prot_match.groups()[0] == protein_id:
            return title
    return best_alignments[0]
    
    
    
if __name__ == '__main__':
    # Runs the ortholog search for a single hard-coded protein:
    # generates its directory tree, extracts the Homo_sapiens protein with
    # fastacmd and calls find_ortholog_by_RBH for every default species.
    protein_id = "ENSP00000311134"
    acg = AlignmentCommandGenerator()
    dc = DirectoryCrawler()
    mutual_best_logger = Logger.Instance().get_logger('mutual_best')

    dc.generate_directory_tree(protein_id)
    descr_file_path = dc.get_protein_description_file_path(protein_id)
    descr_file = open(descr_file_path, 'w')

    # try/finally closes the description file even if a step below raises
    try:
        output_file_path = dc.get_protein_path(protein_id) + "/" + "Homo_sapiens.fasta"

        fastacmd = acg.generate_fastacmd_protein_command(protein_id, "Homo_sapiens", "all", output_file_path)
        os.system(fastacmd)

        for species in get_default_species_list():
            # same six arguments as the call in main(): the description file
            # and the logger were previously not passed at all
            find_ortholog_by_RBH("Homo_sapiens", species, output_file_path, protein_id, descr_file, mutual_best_logger)
    finally:
        descr_file.close()