示例#1
0
def get_all_metrics():
    """Collect alignment metrics for every unordered pair of initial proteins.

    For each pair drawn from ``get_list('InitialProtein')`` the metrics of the
    pair's aligned file are read with ``get_metrics_from_filename``.

    Returns:
        A ``pandas.DataFrame`` with one row per pair and the columns ``id1``,
        ``id2``, ``length1``, ``length2``, ``identity``, ``gaps`` and
        ``status``.  ``status`` is ``'isoform'`` when identity equals 1 and
        gaps are positive, ``'duplicate'`` when identity equals 1 and gaps
        equal 0, and ``'okay'`` otherwise.
    """
    pairs = list(it.combinations(get_list('InitialProtein'), 2))
    pair_metrics = [
        get_metrics_from_filename(
            get_pairwise_filename(left, right, aligned=True)
        )
        for left, right in pairs
    ]

    table = pd.DataFrame({
        'id1': [left.id for left, _ in pairs],
        'id2': [right.id for _, right in pairs],
        'length1': [entry['first_length'] for entry in pair_metrics],
        'length2': [entry['second_length'] for entry in pair_metrics],
        'identity': [entry['percent_identity'] for entry in pair_metrics],
        'gaps': [entry['gap_fraction'] for entry in pair_metrics],
    })

    # Every pair starts as 'okay'; perfect-identity pairs are then relabelled
    # depending on whether the alignment contains gaps.
    table['status'] = 'okay'
    perfect_identity = table['identity'] == 1
    with_gaps = perfect_identity & (table['gaps'] > 0)
    without_gaps = perfect_identity & (table['gaps'] == 0)
    table.loc[with_gaps, 'status'] = 'isoform'
    table.loc[without_gaps, 'status'] = 'duplicate'
    return table
示例#2
0
def make_sequence_files():
    """Write one two-record FASTA file per unordered pair of initial proteins.

    Each file is placed under ``data/pairwise`` using the name returned by
    ``get_pairwise_filename`` for the pair.
    """
    print('Splitting protein sequences into pairwise files...')
    output_directory = os.path.join('data', 'pairwise')
    for pair in it.combinations(get_list('InitialProtein'), 2):
        target = os.path.join(output_directory, get_pairwise_filename(*pair))
        with open(target, 'w') as handle:
            SeqIO.write(list(pair), handle, 'fasta')
示例#3
0
	def process(self, input):
		"""
		Count patents per application year and college, split by patent type.

		The original note describes the result as the patent statistics used
		as the "Y" data (year + university).

		Args:
			input: mapping with the keys 'college_dir' (passed to get_list to
				obtain the college names) and 'patent_dir' (directory whose
				files are concatenated into 'data.patent_pair').

		Returns:
			A dict with two entries:
			'yy_dict' maps "<year>\t<college>" to {'A': count, 'U': count},
			where a patent is type 'U' when its publication number ends with
			'U' and type 'A' otherwise;
			'college_list' is the list returned by get_list.
		"""
		college_list = get_list(input['college_dir'])

		# Build the merged patent file only once; later runs reuse it.
		# NOTE(review): the path is spliced into a shell command unquoted, so
		# 'patent_dir' must not contain spaces or shell metacharacters.
		if not os.path.exists('data.patent_pair'):
			os.system('cat ' + input['patent_dir'] +'/* > data.patent_pair')

		patent_dict = {}
		# The context manager closes the merged file even if a row fails to
		# parse; the file's encoding is left at the platform default.
		with open('data.patent_pair') as patent_file:
			for line in patent_file:
				# Rows are tab-separated; pair each field with its column title.
				info = dict(zip(self.title_list, line.strip().split('\t')))

				patent_type = 'U' if info['公开(公告)号'].endswith('U') else 'A'
				apply_year = info['申请日'][:4]
				apply_person = info['申请人']

				# An applicant field may name several colleges; count the
				# patent once for every college whose name appears in it.
				for college in college_list:
					if college not in apply_person:
						continue
					key = apply_year + '\t' + college
					counts = patent_dict.setdefault(key, {'A': 0, 'U': 0})
					counts[patent_type] += 1

		return {'yy_dict': patent_dict,
				'college_list': college_list
				}
示例#4
0
def single_linkage_clustering(threshold, variable='both'):
    """Assign sequences to clusters by single linkage over pairwise metrics.

    Two sequences are linked when their pairwise metrics pass the threshold;
    whenever a linked pair is processed, every sequence carrying the first
    sequence's label is relabelled with the second sequence's label.

    Args:
        threshold: For ``variable='both'``, a mapping with ``'identity'`` and
            ``'gaps'`` entries; a pair is linked when its identity is above
            the former and its gaps are below the latter.  For
            ``variable='identity'``, a single value the pair's identity must
            exceed.
        variable: ``'both'`` or ``'identity'``.

    Returns:
        A NumPy integer array holding one cluster label per sequence, in the
        order of ``get_list('InitialProtein')``.

    Raises:
        ValueError: If ``variable`` is not supported, or if a linked pair
            refers to an id that is not in the sequence list.
    """
    if variable not in ('both', 'identity'):
        raise ValueError(
            "variable must be 'both' or 'identity', got %r" % (variable,)
        )

    sequences = get_list('InitialProtein')
    # Map each id to its first position (the same answer list.index gives)
    # so that every lookup in the loop below is constant time.
    position_of = {}
    for position, sequence in enumerate(sequences):
        position_of.setdefault(sequence.id, position)

    metrics = pairwise.get_all_metrics()
    if variable == 'both':
        correct_identity = metrics.identity > threshold['identity']
        correct_gaps = metrics.gaps < threshold['gaps']
        linked = correct_identity & correct_gaps
    else:
        linked = metrics.identity > threshold

    # Every sequence starts in its own cluster, labelled by its position.
    current_assignment = np.arange(len(sequences))
    linked_pairs = metrics[linked]
    for id1, id2 in zip(linked_pairs.id1, linked_pairs.id2):
        try:
            index1 = position_of[id1]
            index2 = position_of[id2]
        except KeyError as error:
            raise ValueError(
                '%r is not in the sequence list' % (error.args[0],)
            ) from error
        assignment1 = current_assignment[index1]
        assignment2 = current_assignment[index2]
        # Merge: move the whole cluster of the first sequence into the
        # cluster of the second one.
        current_assignment[current_assignment == assignment1] = assignment2
    return current_assignment
示例#5
0
def cluster_and_align(threshold):
    """Write each multi-member cluster to FASTA and align it with mafft.

    Clusters come from ``single_linkage_clustering(threshold)``.  Every
    cluster with more than one member is written to
    ``data/clusters/cluster_<k>.fasta`` and the output of ``mafft`` on that
    file is redirected to ``data/clusters/cluster_<k>__ALIGNED.fasta``.
    ``<k>`` numbers the multi-member clusters from 0 in iteration order.
    """
    output_directory = os.path.join('data', 'clusters')
    assignments = single_linkage_clustering(threshold)
    sequences = get_list('InitialProtein')
    written = 0
    for label in set(assignments):
        member_positions = np.flatnonzero(assignments == label)
        # Singleton clusters have nothing to align.
        if len(member_positions) <= 1:
            continue

        unaligned_path = os.path.join(
            output_directory, 'cluster_%d.fasta' % written
        )
        members = [sequences[position] for position in member_positions]
        with open(unaligned_path, 'w') as handle:
            SeqIO.write(members, handle, 'fasta')

        aligned_path = os.path.join(
            output_directory, 'cluster_%d__ALIGNED.fasta' % written
        )
        # The shell performs the redirection of mafft's output to the file.
        subprocess.call(
            'mafft %s > %s' % (unaligned_path, aligned_path), shell=True
        )
        written += 1
示例#6
0
from tools import create_file, create_directory, get_list, delete_file, copy_file, save_info, change_dir
from game import game

# Interactive entry point of the file manager: greet the user, record the
# start through save_info, then read commands until the user types 'exit'.
print('Hello. File manager is ready to work.')

# No command has been read yet; None lets the loop below run at least once.
command = None
save_info('Programm started')

while command != 'exit':

    command = input(
        'Enter command or exit to end work. Enter help to list commands: ')

    if command == 'list':
        # Delegates to tools.get_list, called without arguments.
        get_list()
    elif command == '0':
        # Command '0': ask for a path and hand it to change_dir.
        name = input('Enter new path: ')
        change_dir(name)
    elif command == '1':
        # Command '1': create a file; an empty name is reported and recorded
        # through save_info instead of being passed to create_file.
        name = input('Enter file name: ')
        if name == '':
            print('File name is missing')
            save_info('Error - File name is missing')
        else:
            create_file(name)
            save_info(f'File {name} is created')
    elif command == '2':
        # Command '2': ask for a folder name; an empty name is reported.
        name = input('Enter folder name: ')
        if name == '':
            print('Folder name is missing')
示例#7
0
 def setUp(self):
     """Store the 'InitialNucleotide' list and the 'InitialNucleotide' and
     'InitialProtein' dictionaries on the instance."""
     self.nucleotide_records = get_list('InitialNucleotide')
     self.nucleotide_dictionary = get_dictionary('InitialNucleotide')
     self.protein_dictionary = get_dictionary('InitialProtein')