Exemplo n.º 1
0
    def parse_fastq(r2_file: str, stags_file_name: str,
                    index_file_name: str) -> tuple:
        # get file name without extension
        try:
            base_out_file_name = R2.get_file_name(r2_file)
        except TypeError as te:
            raise

        # read sample tags
        log.info('R2: Reading sample tags file')
        stags: dict = R2.read_st(stags_file_name)

        # read cartridge index
        log.info('R2: Reading cartridge index list')
        cart_idx: dict = R2.read_cartridge_index(index_file_name)

        r2_passed = {}
        r2_dropped = set()
        with gzopen(r2_file, 'rt') as r2_f:
            r2_f.seek(0, 2)
            uncompressed_size = r2_f.tell()
            r2_f.seek(0)

            with tqdm(total=uncompressed_size) as pbar:
                for header in r2_f:
                    header = header.strip()
                    pbar.update(len(header))

                    # the read index is located in the last 8 character of the header
                    read_idx = header[-8:]
                    if read_idx in cart_idx:
                        read_seq = r2_f.readline().strip()
                        pbar.update(len(read_seq))

                        # Check Read length and Highest Single Nucleotide Frequency (SNF)
                        if not R2.has_minimum_read_length(
                                read_seq) or not R2.check_snf(read_seq):
                            r2_dropped.add((header, "min_length or snf"))
                            r2_f.readline()  # skip "+"
                            temp = r2_f.readline()  # skip qual line
                            pbar.update(len(temp) + 1)
                        else:
                            temp_st_name = [
                                st_names for st, st_names in stags.items()
                                if st in read_seq
                            ]

                            r2_f.readline()  # skip "+"
                            read_qual = r2_f.readline().strip(
                            )  # process qual line
                            pbar.update(len(read_qual) + 1)
                            if len(temp_st_name) == 1:
                                if not R2.has_minimum_quality_value(read_qual):
                                    r2_dropped.add((header, "min_qual"))
                                else:
                                    r2_passed[
                                        header] = base_out_file_name + "_" + temp_st_name[
                                            0] + "_" + cart_idx[read_idx]
                            elif len(temp_st_name) > 1:
                                r2_dropped.add((header, "multiple_st"))
                    else:
                        r2_dropped.add((header, "cart_idx"))
                        temp = len(r2_f.readline())  # skip fasta line
                        r2_f.readline()  # skip "+"
                        temp = temp + len(
                            r2_f.readline().strip()) + 1  # skip qual line
                        pbar.update(temp)

                pbar.update(uncompressed_size)
                pbar.set_description(f"Processed {uncompressed_size} reads")
        return r2_passed, r2_dropped
Exemplo n.º 2
0
def main(r1: str, r2: str, r2_map: str, bed: str, read_length: int, stags_file_name: str, index_file_name: str, matrix_name: str):
    log.info('Processing R2')
    # read alignment passed/dropped read headers
    r2_map_passed: dict
    r2_map_dropped: set
    r2_map_passed, r2_map_dropped = R2.parse_bam(r2_map, bed)
    
    ###testing
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7200:1063'] =  'PTPRC'
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7201:1063'] =  'PIK3AP1'
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7202:1063'] =  'PIK3AP1'
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7203:1063'] =  'PIK3AP1'
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7204:1063'] =  'SOD2'
    r2_map_passed['NS500299:176:HH2GTBGXG:1:11101:7205:1063'] =  'SOD2'
    ###
    
    print('r2_map_passed')
    for k,v in r2_map_passed.items():
        print(k,v)
    
    #print('r2_map_dropped')
    #for x in r2_map_dropped:
    #    print(x)

    r2_passed: dict
    r2_dropped: set
    r2_passed, r2_dropped = R2.parse_fastq(r2, stags_file_name, index_file_name)
    
    #print('r2_passed')
    #for k,v in r2_passed.items():
    #    print(k,v)
    
    #print('r2_dropped')
    #for x in r2_dropped:
    #    print(x)

    # merge dropped reads in R2 and dispose variables
    r2_all_dropped: set = r2_map_dropped.union(r2_dropped)
    del r2_map_dropped
    del r2_dropped
    
    # r2_all_dropped.clear()  # needed for testing r1.py

    log.info('Processing R1')
    r1_passed: dict
    r1_dropped: list
    r1_passed, r1_dropped = R1.parse_fastq(r1, read_length, r2_all_dropped)
    
    ###testing
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7200:1063 1:N:0:CGAGGCTG'] = ('ATCACGTTA', 'CGATGTTTA', 'TTAGGCATA', 'CCCCAGGG')
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7201:1063 1:N:0:CGAGGCTG'] = ('GAGTACATT', 'TACTAGTCA', 'ATCGAGTCT', 'CCCCGGGG')
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7202:1063 1:N:0:CGAGGCTG'] = ('GAGTACATT', 'TACTAGTCA', 'ATCGAGTCT', 'CCCCGGGG')
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7203:1063 1:N:0:CGAGGCTG'] = ('GAGTACATT', 'TACTAGTCA', 'ATCGAGTCT', 'CCCCAGGG')
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7204:1063 1:N:0:CGAGGCTG'] = ('GAGTACATT', 'TACTAGTCA', 'ATCGAGTCT', 'CCCCGGGG')
    r1_passed['@NS500299:176:HH2GTBGXG:1:11101:7205:1063 1:N:0:CGAGGCTG'] = ('GAGTATTAG', 'TACTAGTCA', 'ATCGAGTCT', 'CCCCGGGG')
    ###
    
    print('r1_passed')
    for k,v in r1_passed.items():
        print(k,v)
        
    #print('r1_dropped')
    #for x in r1_dropped:
    #    print(x)
    
    log.info('Generating expression matrix')
    r1_compiled: dict  = Matrix.compile_r1_passed(r1_passed)
    
    del r1_passed
    del r1_dropped
    
    #print('r1_compiled')
    #for k,v in r1_compiled.items():
    #    print(k,v)
    
    r1_map_info: dict = Matrix.merge_r1_map_info(r1_compiled, r2_map_passed)
    
    del r2_map_passed
    
    #print('r1_map_info')
    #for k,v in r1_map_info.items():
    #    print(k,v)
    
    gene_symbols: list = Matrix.get_gene_symbols(bed)
    
    rasd_df: pd.DataFrame = Matrix.generate_rasd_matrix(r1_compiled, r1_map_info, gene_symbols)
    
    del r1_compiled
    del r1_map_info
    
    #rasd_df.to_csv('rasd_df.csv')
    #print('rasd_df')
    
    rsec_df: pd.DataFrame = Matrix().generate_rsec_matrix(rasd_df, gene_symbols)
    
    del rasd_df
    
    matrix_path = '../files/' + matrix_name + '.csv'
    
    rsec_df.to_csv(matrix_path)
Exemplo n.º 3
0
def main(r1: str, r2: str, r2_map: str, bed: str, read_length: int,
         stags_file_name: str, index_file_name: str, matrix_name: str):
    '''
    log.info('Processing R2')
    # read alignment passed/dropped read headers
    r2_map_passed: dict
    r2_map_dropped: set
    r2_map_passed, r2_map_dropped = R2.parse_bam(r2_map, bed)

    r2_passed: dict
    r2_dropped: set
    r2_passed, r2_dropped = R2.parse_fastq(r2, stags_file_name, index_file_name)

    # merge dropped reads in R2 and dispose variables
    r2_all_dropped = r2_map_dropped.union(r2_dropped)
    del r2_map_dropped
    del r2_dropped
    
    log.info('Processing R1')
    r1_passed: dict
    r1_dropped: list
    r1_passed, r1_dropped = R1.parse_fastq(r1, read_length, r2_all_dropped)
    
    # merge dropped reads
    # r2_passed_good = R2.purge_reads(r2_passed, r1_dropped)
    # r1_passed_good = R1.purge_reads(r1_passed, r2_dropped)
    '''

    log.info('Generating expression matrix')
    #r1_compiled: dict  = Matrix.compile_r1_passed(r1_passed)

    #del r1_passed
    #del r1_dropped

    #r1_map_info: dict = Matrix.merge_r1_map_info(r1_compiled, r2_map_passed)

    #del r2_map_passed

    gene_symbols: list = Matrix.get_gene_symbols(bed)

    #rasd_df: pd.DataFrame = Matrix.generate_rasd_matrix(r1_compiled, r1_map_info, gene_symbols)
    # edit file name
    rasd_df: pd.DataFrame = pd.read_csv('../files/cov1-200723_rasd_rebuild.csv', header=0, index_col=0, \
        dtype={'ABCB1': 'uint16', 'ABCB4': 'uint16', 'ADA': 'uint16', 'ADGRE1': 'uint16', 'ADGRG3': 'uint16', 'ADORA1': 'uint16', \
        'AIM2': 'uint16', 'ALAS2': 'uint16', 'ANXA5': 'uint16', 'AOC3': 'uint16', 'APOBEC3G': 'uint16', 'APOE': 'uint16', \
        'AQP4': 'uint16', 'AQP9': 'uint16', 'ARG1': 'uint16', 'ARL4C': 'uint16', 'ARNTL': 'uint16', 'ARNTL2': 'uint16', \
        'ATF6B': 'uint16', 'AURKB': 'uint16', 'AZU1': 'uint16', 'B3GAT1': 'uint16', 'BACH2': 'uint16', 'BAX': 'uint16', \
        'BCL11B': 'uint16', 'BCL2': 'uint16', 'BCL2A1': 'uint16', 'BCL2L11': 'uint16', 'BCL6': 'uint16', 'BHLHE41': 'uint16', \
        'BIN2': 'uint16', 'BIRC3': 'uint16', 'BLK': 'uint16', 'BLNK': 'uint16', 'BPI': 'uint16', 'BTG1': 'uint16', 'BTLA': 'uint16', \
        'C10orf54': 'uint16', 'C1QA': 'uint16', 'C1QB': 'uint16', 'CASP5': 'uint16', 'CBLB': 'uint16', 'CCL1': 'uint16', \
        'CCL13': 'uint16', 'CCL17': 'uint16', 'CCL19': 'uint16', 'CCL2': 'uint16', 'CCL20': 'uint16', 'CCL22': 'uint16', \
        'CCL3': 'uint16', 'CCL4': 'uint16', 'CCL5': 'uint16', 'CCND2': 'uint16', 'CCR1': 'uint16', 'CCR10': 'uint16', 'CCR2': 'uint16', \
        'CCR3': 'uint16', 'CCR4': 'uint16', 'CCR5': 'uint16', 'CCR7': 'uint16', 'CCR8': 'uint16', 'CCR9': 'uint16', 'CD14': 'uint16', \
        'CD160': 'uint16', 'CD163': 'uint16', 'CD1A': 'uint16', 'CD1B': 'uint16', 'CD1C': 'uint16', 'CD2': 'uint16', 'CD200': 'uint16', \
        'CD209': 'uint16', 'CD22': 'uint16', 'CD24': 'uint16', 'CD244': 'uint16', 'CD247': 'uint16', 'CD27': 'uint16', 'CD274': 'uint16', \
        'CD28': 'uint16', 'CD300A': 'uint16', 'CD33': 'uint16', 'CD34': 'uint16', 'CD36': 'uint16', 'CD37': 'uint16', 'CD38': 'uint16', \
        'CD3D': 'uint16', 'CD3E': 'uint16', 'CD3G': 'uint16', 'CD4': 'uint16', 'CD40': 'uint16', 'CD44': 'uint16', 'CD48': 'uint16', \
        'CD5': 'uint16', 'CD52': 'uint16', 'CD6': 'uint16', 'CD63': 'uint16', 'CD69': 'uint16', 'CD7': 'uint16', 'CD70': 'uint16', \
        'CD72': 'uint16', 'CD74': 'uint16', 'CD79A': 'uint16', 'CD79B': 'uint16', 'CD80': 'uint16', 'CD86': 'uint16', 'CD8A': 'uint16', 'CD8B': 'uint16', 'CD9': 'uint16', 'CEACAM8': 'uint16', 'CEBPD': 'uint16', 'CHI3L1': 'uint16', 'CHI3L2': 'uint16', 'CLC': 'uint16', 'CLEC10A': 'uint16', 'CLEC4D': 'uint16', 'CLEC4E': 'uint16', 'CLOCK': 'uint16', 'CMKLR1': 'uint16', 'CMTM2': 'uint16', 'CNOT2': 'uint16', 'CNTNAP3': 'uint16', 'COL1A2': 'uint16', 'CPA3': 'uint16', 'CR2': 'uint16', 'CRY1': 'uint16', 'CRY2': 'uint16', 'CSF2': 'uint16', 'CSF3': 'uint16', 'CST7': 'uint16', 'CTLA4': 'uint16', 'CTSD': 'uint16', 'CTSG': 'uint16', 'CTSW': 'uint16', 'CX3CR1': 'uint16', 'CXCL1': 'uint16', 'CXCL10': 'uint16', 'CXCL11': 'uint16', 'CXCL13': 'uint16', 'CXCL16': 'uint16', 'CXCL2': 'uint16', 'CXCL3': 'uint16', 'CXCL5': 'uint16', 'CXCL8': 'uint16', 'CXCL9': 'uint16', 'CXCR1': 'uint16', 'CXCR2': 'uint16', 'CXCR3': 'uint16', 'CXCR4': 'uint16', 'CXCR5': 'uint16', 'CXCR6': 'uint16', 'DBP': 'uint16', 'DEC1': 'uint16', 'DEFA3': 'uint16', 'DEFA4': 'uint16', 'DOCK8': 'uint16', 'DPP4': 'uint16', 'DUSP1': 'uint16', 'DUSP2': 'uint16', 'DUSP4': 'uint16', 'E': 'uint16', 'EBF1': 'uint16', 'EGR1': 'uint16', 'EGR3': 'uint16', 'ELANE': 'uint16', 'ENTPD1': 'uint16', 'EOMES': 'uint16', 'EPHA1': 'uint16', 'EPX': 'uint16', 'F13A1': 'uint16', 'F5': 'uint16', 'FAM129C': 'uint16', 'FAM65B': 'uint16', 'FAS': 'uint16', 'FASLG': 'uint16', 'FCER1A': 'uint16', 'FCER1G': 'uint16', 'FCER2': 'uint16', 'FCGR3A': 'uint16', 'FCN1': 'uint16', 'FLT3': 'uint16', 'FN1': 'uint16', 'FOSB': 'uint16', 'FOSL1': 'uint16', 'FOXO1': 'uint16', 'FOXP1': 'uint16', 'FOXP3': 'uint16', 'FTH1': 'uint16', 'FUT4': 'uint16', 'FYB': 'uint16', 'FYN': 'uint16', 'GAB2': 'uint16', 'GAPDH': 'uint16', 'GIMAP2': 'uint16', 'GIMAP5': 'uint16', 'GNAI2': 'uint16', 'GNLY': 'uint16', 'GZMA': 'uint16', 'GZMB': 'uint16', 'GZMH': 'uint16', 'GZMK': 'uint16', 'HAVCR2': 'uint16', 'HLA-A': 'uint16', 'HLA-DMA': 'uint16', 'HLA-DPA1': 'uint16', 'HLA-DQB1': 'uint16', 'HLA-DRA': 'uint16', 'HLF': 'uint16', 'HMMR': 'uint16', 'HSP90AA1': 'uint16', 'ICAM1': 'uint16', 'ICOS': 'uint16', 'IER3': 'uint16', 'IFITM2': 'uint16', 'IFITM3': 'uint16', 'IFNA1': 'uint16', 'IFNG': 'uint16', 'IFNGR1': 'uint16', 'IGBP1': 'uint16', 'IGFBP2': 'uint16', 'IGHA1': 'uint16', 'IGHD': 'uint16', 'IGHE': 'uint16', 'IGHG1': 'uint16', 'IGHG2': 'uint16', 'IGHG3': 'uint16', 'IGHG4': 'uint16', 'IGHM': 'uint16', 'IGKC': 'uint16', 'IGLC3': 'uint16', 'IKZF1': 'uint16', 'IKZF2': 'uint16', 'IL12A': 'uint16', 'IL12RB1': 'uint16', 'IL12RB2': 'uint16', 'IL13': 'uint16', 'IL15': 'uint16', 'IL15RA': 'uint16', 'IL17A': 'uint16', 'IL17F': 'uint16', 'IL18': 'uint16', 'IL18R1': 'uint16', 'IL18RAP': 'uint16', 'IL1B': 'uint16', 'IL1R2': 'uint16', 'IL1RL1': 'uint16', 'IL1RN': 'uint16', 'IL2': 'uint16', 'IL21': 'uint16', 'IL22': 'uint16', 'IL23R': 'uint16', 'IL25': 'uint16', 'IL2RA': 'uint16', 'IL2RB': 'uint16', 'IL3': 'uint16', 'IL31': 'uint16', 'IL32': 'uint16', 'IL33': 'uint16', 'IL3RA': 'uint16', 'IL4': 'uint16', 'IL4R': 'uint16', 'IL5': 'uint16', 'IL6': 'uint16', 'IL7R': 'uint16', 'IL9': 'uint16', 'IRF1': 'uint16', 'IRF4': 'uint16', 'IRF6': 'uint16', 'IRF8': 'uint16', 'IRF9': 'uint16', 'ITGA4': 'uint16', 'ITGAE': 'uint16', 'ITGAM': 'uint16', 'ITGAX': 'uint16', 'ITGB2': 'uint16', 'JCHAIN': 'uint16', 'JUN': 'uint16', 'JUNB': 'uint16', 'KCNE3': 'uint16', 'KCNK5': 'uint16', 'KDELR1': 'uint16', 'KIAA0101': 'uint16', 'KIR2DL1': 'uint16', 'KIT': 'uint16', 'KITLG': 'uint16', 'KLRB1': 'uint16', 'KLRC1': 'uint16', 'KLRC3': 'uint16', 'KLRC4': 'uint16', 'KLRF1': 'uint16', 'KLRG1': 'uint16', 'KLRK1': 'uint16', 'LAG3': 'uint16', 'LAIR2': 'uint16', 'LAMP1': 'uint16', 'LAMP3': 'uint16', 'LAP3': 'uint16', 'LAT': 'uint16', 'LAT2': 'uint16', 'LCK': 'uint16', 'LEF1': 'uint16', 'LGALS1': 'uint16', 'LGALS3': 'uint16', 'LGALS9': 'uint16', 'LIF': 'uint16', 'LILRB4': 'uint16', 'LIPA': 'uint16', 'LRRC32': 'uint16', 'LTA': 'uint16', 'LTB': 'uint16', 'LY86': 'uint16', 'LYN': 'uint16', 'M': 'uint16', 'MCM2': 'uint16', 'MCM4': 'uint16', 'MGST1': 'uint16', 'MITF': 'uint16', 'MKI67': 'uint16', 'MME': 'uint16', 'MMP12': 'uint16', 'MMP9': 'uint16', 'MS4A1': 'uint16', 'MTHFR': 'uint16', 'MYC': 'uint16', 'MZB1': 'uint16', 'N': 'uint16', 'NAMPT': 'uint16', 'NCAM1': 'uint16', 'NCR3': 'uint16', 'NFIL3': 'uint16', 'NFKBIA': 'uint16', 'NINJ2': 'uint16', 'NKG7': 'uint16', 'NPAS2': 'uint16', 'NR1D1': 'uint16', 'NR1D2': 'uint16', 'NRP1': 'uint16', 'NT5E': 'uint16', 'ORF10': 'uint16', 'ORF1ab': 'uint16', 'ORF3a': 'uint16', 'ORF6': 'uint16', 'ORF7a': 'uint16', 'ORF7b': 'uint16', 'ORF8': 'uint16', 'PASK': 'uint16', 'PAX5': 'uint16', 'PCNA': 'uint16', 'PDCD1': 'uint16', 'PDIA4': 'uint16', 'PDIA6': 'uint16', 'PER1': 'uint16', 'PER2': 'uint16', 'PER3': 'uint16', 'PI3': 'uint16', 'PIGF': 'uint16', 'PIK3AP1': 'uint16', 'PIK3IP1': 'uint16', 'PMCH': 'uint16', 'POU2AF1': 'uint16', 'PPARGC1B': 'uint16', 'PRDM1': 'uint16', 'PRF1': 'uint16', 'PSEN1': 'uint16', 'PSME2': 'uint16', 'PTGDR2': 'uint16', 'PTPRC': 'uint16', 'PTTG2': 'uint16', 'QPCT': 'uint16', 'RGS1': 'uint16', 'RNASE2': 'uint16', 'RNASE6': 'uint16', 'RORA': 'uint16', 'RORC': 'uint16', 'RPN2': 'uint16', 'RUNX3': 'uint16', 'S': 'uint16', 'S100A10': 'uint16', 'S100A12': 'uint16', 'S100A9': 'uint16', 'SDC4': 'uint16', 'SELL': 'uint16', 'SELPLG': 'uint16', 'SERPINB1': 'uint16', 'SERPINE2': 'uint16', 'SLC25A37': 'uint16', 'SLC7A7': 'uint16', 'SMAD3': 'uint16', 'SMAD4': 'uint16', 'SNCA': 'uint16', 'SOD2': 'uint16', 'SOX9': 'uint16', 'SPP1': 'uint16', 'STAT1': 'uint16', 'STAT3': 'uint16', 'STAT4': 'uint16', 'STAT5A': 'uint16', 'STAT6': 'uint16', 'TARP': 'uint16', 'TBX21': 'uint16', 'TCF4': 'uint16', 'TCF7': 'uint16', 'TCL1A': 'uint16', 'TEF': 'uint16', 'TGFB1': 'uint16', 'TGFB3': 'uint16', 'TGFBI': 'uint16', 'THBD': 'uint16', 'THBS1': 'uint16', 'TIAF1': 'uint16', 'TIGIT': 'uint16', 'TLR2': 'uint16', 'TLR7': 'uint16', 'TLR8': 'uint16', 'TLR9': 'uint16', 'TMEM97': 'uint16', 'TNF': 'uint16', 'TNFRSF13C': 'uint16', 'TNFRSF17': 'uint16', 'TNFRSF25': 'uint16', 'TNFRSF4': 'uint16', 'TNFRSF8': 'uint16', 'TNFRSF9': 'uint16', 'TNFSF10': 'uint16', 'TNFSF13': 'uint16', 'TNFSF13B': 'uint16', 'TNFSF14': 'uint16', 'TNFSF8': 'uint16', 'TOP2A': 'uint16', 'TPSAB1': 'uint16', 'TRAC': 'uint16', 'TRAT1': 'uint16', 'TRBC2': 'uint16', 'TRDC': 'uint16', 'TREM1': 'uint16', 'TRIB2': 'uint16', 'TSPAN32': 'uint16', 'TXK': 'uint16', 'TYMS': 'uint16', 'UBE2C': 'uint16', 'VCAM1': 'uint16', 'VEGFA': 'uint16', 'VMO1': 'uint16', 'VNN2': 'uint16', 'VPREB3': 'uint16', 'VPS28': 'uint16', 'VSIG4': 'uint16', 'XBP1': 'uint16', 'YBX3': 'uint16', 'YY1': 'uint16', 'ZAP70': 'uint16', 'ZBED2': 'uint16', 'ZBTB16': 'uint16', 'ZNF683': 'uint16'})

    #del r1_compiled
    #del r1_map_info

    rsec_df: pd.DataFrame = Matrix().generate_rsec_matrix(
        rasd_df, gene_symbols)

    del rasd_df

    matrix_path = '../files/' + matrix_name + '.csv'

    rsec_df.to_csv(matrix_path)
Exemplo n.º 4
0
    
    # dbec_df: pd.DataFrame = Matrix.generate_dbec_matrix(rsec_df)
    
    # print('dbec_df')
    # dbec_df.to_csv('dbec_df.csv')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Input and output file arguments')
    parser.add_argument("--r1", help="R1 fastq file")
    parser.add_argument("--r2", help="R2 fastq file")
    parser.add_argument("--r2_bam", help="R2 bam file")
    parser.add_argument("--bed", help="Target genes coordinates")
    parser.add_argument("--tag", help="Sample tags file")
    parser.add_argument("--index", help="Cartridge indices file")
    parser.add_argument("--read_length", help="Sequencing full read length", type=int)
    parser.add_argument("--matrix", help="Output expression matrix file")
    args = parser.parse_args()

    r1 = args.r1
    r2 = args.r2
    r2_bam = args.r2_bam
    bed = args.bed
    st_f_name = args.tag
    ind_f_name = args.index
    read_length = args.read_length
    matrix_name = args.matrix

    log.info('Start processing BD Rhapsody reads')
    main(r1, r2, r2_bam, bed, read_length, st_f_name, ind_f_name, matrix_name)
Exemplo n.º 5
0
def main(r1: str, r2: str, r2_map: str, bed: str, read_length: int,
         stags_file_name: str, index_file_name: str, matrix_name: str):
    '''
    log.info('Processing R2')
    # read alignment passed/dropped read headers
    r2_map_passed: dict
    r2_map_dropped: set
    r2_map_passed, r2_map_dropped = R2.parse_bam(r2_map, bed)

    r2_passed: dict
    r2_dropped: set
    r2_passed, r2_dropped = R2.parse_fastq(r2, stags_file_name, index_file_name)

    # merge dropped reads in R2 and dispose variables
    r2_all_dropped = r2_map_dropped.union(r2_dropped)
    del r2_map_dropped
    del r2_dropped
    
    log.info('Processing R1')
    r1_passed: dict
    r1_dropped: list
    r1_passed, r1_dropped = R1.parse_fastq(r1, read_length, r2_all_dropped)
    
    # merge dropped reads
    # r2_passed_good = R2.purge_reads(r2_passed, r1_dropped)
    # r1_passed_good = R1.purge_reads(r1_passed, r2_dropped)
    '''

    r1_passed: dict = {}
    # edit file name
    with open('../files/r1_session_cov1.txt', 'r') as r1f:
        for line in r1f:
            line1 = line.split("'")
            r1_passed[line.split(' (')[0]] = (line1[1], line1[3], line1[5],
                                              line1[7])

    log.info('Generating expression matrix')
    r1_compiled: dict = Matrix.compile_r1_passed(r1_passed)

    del r1_passed
    #del r1_dropped

    r2_map_passed: dict = {}
    # edit file name
    with open('../files/mapping_session_cov1.txt', 'r') as mapf:
        for line in mapf:
            line1 = line.split()
            r2_map_passed[line1[0]] = (line1[1])

    r1_map_info: dict = Matrix.merge_r1_map_info(r1_compiled, r2_map_passed)

    del r2_map_passed

    gene_symbols: list = Matrix.get_gene_symbols(bed)

    rasd_df: pd.DataFrame = Matrix.generate_rasd_matrix(
        r1_compiled, r1_map_info, gene_symbols)

    del r1_compiled
    del r1_map_info

    matrix_path = '../files/' + matrix_name + '.csv'

    rasd_df.to_csv(matrix_path)