#!/usr/bin/env python
'''Read and process blastp output file.'''
from blastplus import BlastProcessor, BLASTP_OUTPUT_FILE

# Parse the blastp tabular output once.
bp = BlastProcessor(BLASTP_OUTPUT_FILE)

# Write the subject ids to disk for sequence retrieval.
bp.uniq_hits().to_csv(
    '../data/analyze-output/subject_ids.ref',
    header=False,
    index=False,
)
# Brought into scope here because no import block is visible in this chunk.
import os

import pandas as pd

# NOTE(review): `query_files` and `COMMENT` are not defined in this chunk —
# presumably provided earlier in the file; confirm. COMMENT looks like the
# FASTA header marker ('>').

# Collect the query ids (header lines) from every query file, remembering
# which file each id came from.
query_frames = []
for query_file in query_files:
    # 'rU' mode was removed in Python 3.11; plain 'r' already does
    # universal-newline translation on Python 3.
    with open(query_file, 'r') as f:
        query_ids = []
        # Iterate the file object directly; xreadlines() is Python-2-only.
        for line in f:
            # startswith() is safe on empty lines, unlike line[0].
            if line.startswith(COMMENT):
                # Drop the marker and the trailing newline. rstrip('\n')
                # (not [:-1]) keeps the last character intact when the
                # final line has no newline.
                query_ids.append(line[1:].rstrip('\n'))
    filenames = len(query_ids) * [os.path.basename(query_file)]
    query_frames.append(pd.DataFrame({
        'filename': filenames,
        'query id': query_ids,
    }))

# DataFrame.append was removed in pandas 2.0: accumulate frames in a list
# and concatenate once. reset_index() (no drop) preserves the original
# behavior of keeping the per-frame index as an 'index' column.
if query_frames:
    df_queries = pd.concat(query_frames).reset_index()
else:
    df_queries = pd.DataFrame(columns=['filename', 'query id']).reset_index()

# Create DataFrame of blast results
bp = BlastProcessor(BLASTP_OUTPUT_FILE)
df_blast_results = bp.top_hits()

# Merge the DataFrames to get the filename and blast hits
df = pd.merge(df_queries, df_blast_results)
df = df[['filename', 'subject id']]
mask_dups = df.duplicated()
df = df[~mask_dups]  # Get unique (filename, subject id) pairs

# Concatenate the query and subject sequences into the alignment files
for query_filename, group in df.groupby('filename'):
    query_file = os.path.join(QUERY_DIR, query_filename)
    align_filename = os.path.splitext(query_filename)[0] + '.cat.faa'
    align_file = os.path.join(ALIGN_DIR, align_filename)
    with open(query_file, 'r') as f:
        query = f.readlines()
    # NOTE(review): the visible chunk ends here; the loop body presumably
    # continues elsewhere (writing `query` plus the group's subject
    # sequences to `align_file`) — confirm against the rest of the file.