Example #1
def transferFeatures(hits):
    """
    In table: feature_cvterm
    RILEY              /class
    genedb_products    /product
    
    In table: featureprop
    EC_number          /EC_number
    colour             /colour
    gene               /gene
    """
    # Connect to geneDB as read only user using ropy.query
    query = ropy.query.QueryProcessor(connection=connectionFactory)
    query.setSQLFilePath(os.path.dirname(__file__) + "/sql/")
    
    for hit in hits:
        # Extract all cvterm related to a feature_id from feature_cvterm table
        query.addQueryFromFile("feature_cvterm_query", "get_cvterm_from_feature_cvterm.sql")
        feature_cvterm_rows = query.runQuery("feature_cvterm_query", (hits[hit],))
        logger.debug("--- %s" % hit)
        logger.debug('/ortholog="%s"' % hits[hit])
        for row in feature_cvterm_rows:
            cvterm_name = row[0]
            cv_name = row[1]
            if cv_name == "RILEY":
                logger.debug('/class="%s"' % (cvterm_name))
            elif cv_name == "genedb_products":
                logger.debug('/product="%s"' % (cvterm_name))
        # Extract all cvterms related to a feature_id from the featureprop table
        query.addQueryFromFile("featureprop_query", "get_cvterm_from_featureprop.sql")
        featureprop_rows = query.runQuery("featureprop_query", (hits[hit],))
        for row in featureprop_rows:
            logger.debug('/%s="%s"' % (row[0], row[1]))
    logger.info("Features transferred")
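A minimal usage sketch with made-up values: hits is expected to map a CDS name in the new genome to an in-house chado feature_id, as produced by getHits further down in these examples.

# Hypothetical ortholog hits: new-genome CDS name -> in-house feature_id
ortholog_hits = {"MYGEN_0001": 12345, "MYGEN_0002": 67890}
transferFeatures(ortholog_hits)  # logs the /class, /product and featureprop qualifiers for each hit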
def explanation_serving_t(train_df):
    data = {'vigil_t': [], 'explanation_serving_time': [], 'l1': [], 'l2': []}
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)

    #Training Models
    logger.info("Model Training Initiation\n=====================")
    kmeans = KMeans(random_state=0)
    mars_ = Earth(feature_importance_type='gcv', )
    vigilance_t = np.linspace(0.01, 3, Config.vigilance_t_frequency)
    for sens_t in vigilance_t:
        logger.info("Sensitivity Level {}".format(sens_t))
        lsnr = PR(mars_, vigil_theta=sens_t)
        lsnr.fit(X_train, y_train)
        for i in range(5):
            q = train_df.iloc[i].values[:4].reshape(1, -1)
            q = sc.transform(q)
            start = time.time()
            m = lsnr.get_model(q)
            end = time.time() - start
            data['vigil_t'].append(sens_t)
            data['explanation_serving_time'].append(end)
            data['l1'].append(lsnr.get_number_of_l1())
            data['l2'].append(lsnr.get_number_of_l2())
    return data
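The returned dict is column-oriented, so it can be written out directly with pandas; a small sketch, assuming pandas is imported as pd and the output path is only an example:

results = explanation_serving_t(train_df)
pd.DataFrame(results).to_csv('output/Performance/explanation_serving_time.csv', index=False)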
Example #3
def del_user(id_tg):
    user = Users.get_or_none(id_tg)
    if user:
        user.delete_by_id(id_tg)
        logger.info(f'User deleted: {user} at {datetime.datetime.now()}')
        return True
    return False
Example #4
def say_welcome(message):
    logger.info(f'</code>@{message.from_user.username}<code> ({message.chat.id}) used /start or /help')
    bot.send_message(
        message.chat.id,
        '<b>Hello! This is a telegram bot template written by <a href="https://github.com/otter18">otter18</a></b>',
        parse_mode='html'
    )
    def __init__(self, ytid, chid, func_send, normal_msg=False,
                 save=False, live=True, chat_folder="chat"):
        # main
        self.livechat = LiveChatAsync(ytid, callback=self.post)
        if chid:
            self.id = str(chid) + "." + ytid
        else:
            self.id = ytid

        # discord channel and post function
        self.chid = str(chid)
        self.send = func_send

        # pytchat parameters
        self.ytid = ytid
        self.normal_msg = normal_msg
        self.live = live

        # save the chat
        self.save = save
        self.folder = chat_folder + "/"
        if save:
            os.makedirs(self.folder, exist_ok=True)

        if not self.is_alive():
            raise ValueError("Stream is not live")
        logger.info(self.id + " is added")
Example #6
def fasta2embl(infasta):
    """
    Convert a sequence file from FASTA to EMBL format using EMBOSS seqret
    Returns the name of the created EMBL file
    """
    util.checkFile(infasta)
    outembl = infasta.split(".")[0] + ".embl"
    """
    Usage: seqret 
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Seqret
    
      Standard (Mandatory) qualifiers:
      [-sequence]          seqall     (Gapped) sequence(s) filename and optional
                                      format, or reference (input USA)
      [-outseq]            seqoutall  [<sequence>.<format>] Sequence set(s)
                                      filename and optional format (output USA)
    
      The basic USA syntax is one of:
        "file"
        "file:entry"
        "format::file"
        "format::file:entry"
        "database:entry"
        "database"
        "@file"
    """
    # Create EMBOSS seqret command line
    cmd = "seqret -sequence fasta::%s -outseq embl::%s " % (infasta, outembl)
    # Call the subprocess using convenience method
    util.runProcess(cmd)
    logger.info("File %s created", outembl)
    return outembl
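The util module used throughout these examples is not shown; a minimal sketch of what checkFile and runProcess might look like, built on the standard library only (the exact error handling is an assumption):

import os
import subprocess
import sys

def checkFile(path):
    # Abort if an expected input file is missing
    if not os.path.isfile(path):
        sys.exit("ERROR: file %s does not exist" % path)

def runProcess(cmd):
    # Run a shell command and abort on a non-zero exit status
    returncode = subprocess.call(cmd, shell=True)
    if returncode != 0:
        sys.exit("ERROR: command failed: %s" % cmd)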
    def load_state(self, **kwargs):
        if not os.path.exists(self.state_file):
            return
        logger.info(f"Read last state from {self.state_file}")
        # Each line of the state file is "<discord channel id>.<youtube video id>"
        for line in open(self.state_file):
            line = line.strip()
            self.add_video(line.split('.')[1], line.split('.')[0], **kwargs)
async def on_ready():
    logger.debug(client.guilds)
    logger.info(f"{client.user} has connected to Discord!")
    # Overwrite the post function after the Discord client is initialized
    for v in chats.videos:
        v.send = discord_notify(int(v.chid))
    await chats.main()
Example #9
def add_user(id_tg, name, sub=False):
    user = Users.get_or_none(id_tg)
    if user:
        return user
    user = Users.create(telegram_id=id_tg, name=name, sub=sub)
    logger.info(f'User created: {id_tg}-{name}, {user}')
    return user
Example #10
def splitSeq(dir, embl, type):
    """
    Split the sequence into separate files, one per CDS feature, in the dir/ directory
    using EMBOSS extractfeat
    
    Usage: extractfeat
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Extractfeat
    
      Standard (Mandatory) qualifiers:
      [-sequence]          seqall     Sequence(s) filename and optional format, or
                                      reference (input USA)
      [-outseq]            seqout     [.] Sequence filename and
                                      optional format (output USA)
   
      Additional (Optional) qualifiers:
       -type               string     [*] By default every feature in the feature
                                      table is extracted. You can set this to be
                                      any feature type you wish to extract.
                                      See http://www.ebi.ac.uk/Services/WebFeat/
                                      for a list of the EMBL feature types and see
                                      the Uniprot user manual in
                                      http://www.uniprot.org/manual/sequence_annotation
                                      for a list of the Uniprot feature types.
                                      The type may be wildcarded by using '*'.
                                      If you wish to extract more than one type,
                                      separate their names with the character '|',
                                      eg:
                                      *UTR | intron (Any string is accepted)
       -featinname         boolean    [N] To aid you in identifying the type of
                                      feature that has been output, the type of
                                      feature is added to the start of the
                                      description of the output sequence.
                                      Sometimes the description of a sequence is
                                      lost in subsequent processing of the
                                      sequences file, so it is useful for the type
                                      to be a part of the sequence ID name. If
                                      you set this to be TRUE then the name is
                                      added to the ID name of the output sequence.

       Associated qualifiers:
       "-outseq" associated qualifiers
       -ossingle2          boolean    Separate file for each entry
       -ofdirectory2       string     Output directory

      The basic USA syntax is one of:
        "file"
        "file:entry"
        "format::file"
        "format::file:entry"
        "database:entry"
        "database"
        "@file"
    """
    util.checkFile(embl)
    # Create directory
    util.createDir(dir)
    cmd = "extractfeat -sequence embl::%s -type %s -featinname YES -outseq fasta:: -osextension2 ffn -ossingle2 Yes -osdirectory2 %s" % (embl, type, dir)
    util.runProcess(cmd)
    logger.info("Sequences extracted into %s" % dir)
Example #11
def say_welcome(message):
    logger.info(
        f'</code>@{message.from_user.username}<code> ({message.chat.id}) used /start or /help'
    )
    bot.send_message(
        message.chat.id, '<b>Mangles words. '
        'When added to a group, it randomly reacts to messages with a negative tone.</b>',
        parse_mode='html')
Example #12
def printMSPCrunch(fasta_hits, reciprocal_hits):
    """
    Print an MSPCrunch format description of the reciprocal hit
    """
    for reciprocal_key in reciprocal_hits.keys():
        if reciprocal_key in fasta_hits:
            logger.info(fasta_hits[reciprocal_key]) 
    logger.info("MSP Crunch extracted")
async def on_message(message):
    # Only read commands, ignoring messages from the bot itself
    if message.author == client.user:
        return
    if not message.content.startswith(".synchat"):
        return

    # if no args
    if not message.content.startswith(".synchat "):
        await message.channel.send("```" + parser.format_help() + "```")
        return

    # read command and videoid
    logger.debug(message.content)
    try:
        args = parser.parse_args(message.content.split()[1:])
    except BaseException as e:
        # Fix this in Python3.9
        logger.warning(str(type(e)) + str(e))
        await message.channel.send("```" + parser.format_help() + "```")
        return

    method, id = args.method, args.id
    dc_channel = message.channel.id

    # list monitor list
    if method == "list":
        ids = [v.ytid for v in chats.videos if v.chid == str(dc_channel)]
        await message.channel.send("sync list: " + ",".join(ids))
        return

    # id cannot be null if user wants to start or stop the chat
    if id is None:
        await message.channel.send("Fail: No video ID provided")
        return

    # start to monitor
    if method == "start":
        logger.info(f"Sync {id} to {dc_channel}")
        if chats.add_video(id,
                           dc_channel,
                           discord_notify(dc_channel),
                           save=True,
                           chat_folder=chat_folder):
            await message.channel.send(f"OK {id}")
        else:
            await message.channel.send(f"Fail to add {id}")

    # stop monitor
    elif method == "stop":
        ok = await chats.remove_video(id, dc_channel)
        if ok:
            await message.channel.send("OK")
        else:
            await message.channel.send(f"No {id} found")
    else:
        await message.channel.send(f"{method} not implemented")
Example #14
def echo(message):
    for t, resp in dialog.items():
        if sum([e in message.text.lower() for e in resp['in']]):
            logger.info(f'</code>@{message.from_user.username}<code> ({message.chat.id}) used {t}:\n\n%s', message.text)
            bot.send_message(message.chat.id, random.choice(resp['out']))
            return

    logger.info(f'</code>@{message.from_user.username}<code> ({message.chat.id}) used echo:\n\n%s', message.text)
    bot.send_message(message.chat.id, message.text)
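The dialog structure echo relies on is not shown; it only needs each entry to carry an 'in' list of trigger substrings and an 'out' list of replies, e.g. (an illustrative fragment, not the original data):

dialog = {
    'greeting': {
        'in': ['hello', 'hi'],          # substrings searched for in message.text.lower()
        'out': ['Hello!', 'Hi there!']  # one of these is sent back at random
    },
}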
Example #15
def print_samples(sh):
    """Compactly print out contents of a SampleHandler"""
    logger.info('Number of samples: %i', len(sh))
    for s in sh:
        logger.info('Sample: %s', s.name())
        numFiles = s.numFiles()
        logger.info('  Number of files:  %i', numFiles)
        logger.info('  Number of events: %i', s.getNumEntries())
        for i in range(numFiles):
            logger.info('  %s', s.fileName(i))
Example #16
def concatSeq(genome_file, dir):
    """
    Concatenate separated CDS sequence fasta files located in dir into one file
    """
    util.checkDir(dir)
    if os.path.exists(genome_file):
        os.remove(genome_file)
    cmd = "cat %s/*.faa > %s" % (dir, genome_file)
    util.runProcess(cmd)
    logger.info("concatSeq finished")
Example #17
def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run FASTA on the protein sequences of the new genome against all in-house genomes
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # List of in-house genomes
    util.checkDir(genomes_dir)
    genome_files = []
    logger.info("Create fasta results directory for each in-house reference genome")
    for genome_file in os.listdir(genomes_dir):
        if '.faa' in genome_file:
            genome_files.append(genome_file)
            # Create fasta results directory for each in-house genome
            util.createDir("%s/%s" % (fasta_dir, genome_file.split(".")[0]))
            logger.info(genome_file)

    util.checkDir(seq_dir)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        for genome_file in genome_files:
            res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, genome_file, res_dir)
            util.submitJobArray(jobname="genepy-fasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-fasta')
        logger.info("Fasta on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            for genome_file in genome_files:
                res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
                cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, seq_file, genomes_dir, genome_file, res_dir, res_file)
                util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Fasta finished")
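util.submitJobArray is also not shown; a rough sketch of how it could wrap an LSF job array, assuming a plain bsub submission (flags and output naming are assumptions, not the original implementation):

def submitJobArray(jobname, jobnum, jobdir, cmd):
    # Submit cmd as an LSF job array of size jobnum; LSF substitutes ${LSB_JOBINDEX}
    # with the task index (1..jobnum) when each array element runs.
    # Single quotes keep ${LSB_JOBINDEX} from being expanded by the submitting shell.
    bsub_cmd = "bsub -J '%s[1-%s]' -o %s/%%J.%%I.out '%s'" % (jobname, jobnum, jobdir, cmd)
    runProcess(bsub_cmd)  # reuse the runProcess helper sketched after fasta2embl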
    def __preprocessing_theta(self, vigil=.4):
        import warnings
        warnings.filterwarnings('ignore')

        #For each cluster
        for j in self.data_in_clusters_L1:
            # Each vector is made up of x in R^d followed by y
            X = self.data_in_clusters_L1[
                j][:, self.d // 2:-1]  #Only care about clustering thetas
            logger.info("Shape of theta vector {}".format(X.shape))
            c = 1
            if (X.shape[0] > 10):
                #         raise ValueError("Error in support of cluster")
                #Tuning k-parameter for kmeans
                c = 0
                prev_inertia = 0
                init = True
                diff = np.inf
                while diff >= vigil:
                    logger.info("Current diff {0}/{1}".format(diff, vigil))
                    c += 1
                    t_kmeans = KMeans(n_clusters=c, random_state=0)
                    t_kmeans.fit(X)
                    pres_inertia = t_kmeans.inertia_
                    if not init:
                        diff = np.abs(prev_inertia - pres_inertia)
                        prev_inertia = pres_inertia
                    else:
                        prev_inertia = pres_inertia
                        init = False

                if np.unique(t_kmeans.labels_).shape[0] != len(
                        t_kmeans.cluster_centers_):
                    print("Cluster {0}".format(j))
                    c = c - 1
            logger.info("Number of clusters in thetas {}".format(c))
            #     #End of tuning

            CLUSTERS = c
            t_kmeans = KMeans(n_clusters=CLUSTERS, random_state=0)
            t_kmeans.fit(X)

            for i in range(CLUSTERS):
                mask = np.where(t_kmeans.labels_ == i)[0]
                logger.info("Data shape in cluster L1 {}, L2 {} : {}".format(
                    j, i, self.data_in_clusters_L1[j][mask].shape))
                self.data_in_clusters_L2[(
                    j, i)] = self.data_in_clusters_L1[j][mask]
            self.THETA_CENTERS[j] = t_kmeans.cluster_centers_
            logger.info("Shape of theta clusters in L1 {} : {}".format(
                j, self.THETA_CENTERS[j].shape))
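The k-selection loop above keeps adding clusters until the drop in k-means inertia falls below the vigilance threshold; the same rule as a standalone helper (the name and defaults are illustrative):

from sklearn.cluster import KMeans

def choose_k_by_inertia_gap(X, vigil=0.4, random_state=0):
    # Grow k until the improvement in inertia between consecutive fits is below vigil
    prev_inertia = None
    k = 0
    while True:
        k += 1
        km = KMeans(n_clusters=k, random_state=random_state).fit(X)
        if prev_inertia is not None and abs(prev_inertia - km.inertia_) < vigil:
            return k
        prev_inertia = km.inertia_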
def training_time(train_df):
    initial = train_df.head(10000)
    part = train_df.head(5000)
    data = {'size': [], 'time': [], 'l1': [], 'l2': []}
    for i in range(10):
        initial = pd.concat([initial, part])
        for j in range(10):
            t, l1, l2 = execution_time(initial)
            data['size'].append(initial.count()[0])
            data['time'].append(t)
            data['l1'].append(l1)
            data['l2'].append(l2)
            logger.info("Loop {}/100".format(i * 10 + j + 1))
    return data
Example #20
def translateSeq(dir):
    """
    Translate nucleic acid sequence in fasta format into protein sequence using
    EMBOSS transeq
    
    Usage: transeq
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Transeq
    
      Standard (Mandatory) qualifiers:
      [-sequence]          seqall     Nucleotide sequence(s) filename and optional
                                      format, or reference (input USA)
      [-outseq]            seqoutall  [.] Protein sequence
                                      set(s) filename and optional format (output
                                      USA)
      Additional (Optional) qualifiers:
       -table              menu       [0] Code to use (Values: 0 (Standard); 1
                                      (Standard (with alternative initiation
                                      codons)); 2 (Vertebrate Mitochondrial); 3
                                      (Yeast Mitochondrial); 4 (Mold, Protozoan,
                                      Coelenterate Mitochondrial and
                                      Mycoplasma/Spiroplasma); 5 (Invertebrate
                                      Mitochondrial); 6 (Ciliate Macronuclear and
                                      Dasycladacean); 9 (Echinoderm
                                      Mitochondrial); 10 (Euplotid Nuclear); 11
                                      (Bacterial); 12 (Alternative Yeast Nuclear);
                                      13 (Ascidian Mitochondrial); 14 (Flatworm
                                      Mitochondrial); 15 (Blepharisma
                                      Macronuclear); 16 (Chlorophycean
                                      Mitochondrial); 21 (Trematode
                                      Mitochondrial); 22 (Scenedesmus obliquus);
                                      23 (Thraustochytrium Mitochondrial))

      The basic USA syntax is one of:
        "file"
        "file:entry"
        "format::file"
        "format::file:entry"
        "database:entry"
        "database"
        "@file"
    """ 
    util.checkDir(dir)
    for file in os.listdir(dir):
        if '.ffn' in file:
            infasta = file
            outpep = file.split(".")[0] + ".faa"
            cmd = "transeq -sequence fasta::%s/%s -outseq fasta::%s/%s -table 11" % (dir, infasta, dir, outpep)
            util.runProcess(cmd)
    logger.info("Sequences translated.")
def execution_time(train_df):
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    #Training Models
    logger.info("Model Training Initiation\n=====================")
    kmeans = KMeans(random_state=0)
    mars_ = Earth(feature_importance_type='gcv', )

    lsnr = PR(mars_, vigil_x=0.01)
    start = time.time()
    lsnr.fit(X_train, y_train)
    return (time.time() - start, lsnr.get_number_of_l1(),
            lsnr.get_number_of_l2())
Example #22
def getHits(fasta_hits, reciprocal_hits):
    """
    Return two dictionaries of ortholog hits and similarity hits containing
    {'new_genome_CDS_name':inhouse_genome_feature_id}
    """
    ortholog_hits = {}
    for reciprocal_key in reciprocal_hits.keys():
        if reciprocal_key in fasta_hits:
            ortholog_hits[reciprocal_key.split("||")[0]] = reciprocal_key.split("||")[1]
            del fasta_hits[reciprocal_key]
    similarity_hits = {}
    for fasta_key in fasta_hits:
        new_genome_key = fasta_key.split("||")[0]
        if new_genome_key not in ortholog_hits:
            similarity_hits[new_genome_key] = fasta_key.split("||")[1]
    logger.info("Hits processed")
    return {'ortholog': ortholog_hits, 'similarity': similarity_hits}
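A tiny worked example of the key convention, assuming keys are built as new_genome_CDS||inhouse_id as in topFastaHits (the values stand in for MSPCrunch lines):

fasta_hits = {"MYGEN_0001||REF_0042": "msp crunch line ...",
              "MYGEN_0002||REF_0099": "msp crunch line ..."}
reciprocal_hits = {"MYGEN_0001||REF_0042": "msp crunch line ..."}
hits = getHits(fasta_hits, reciprocal_hits)
# hits['ortholog']   -> {'MYGEN_0001': 'REF_0042'}   hit found in both directions
# hits['similarity'] -> {'MYGEN_0002': 'REF_0099'}   one-way hit only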
    def __fit_models(self):
        #Fit an Earth model for each cluster
        for l1, l2 in self.data_in_clusters_L2:
            tcluster = self.data_in_clusters_L2[(l1, l2)]
            XX = tcluster[:, :self.d]
            logger.info("Shape of Training data {}".format(XX.shape))
            yy = tcluster[:, -1]
            try:
                estimator = deepcopy(self.learning_algorithm)

                # model = Earth(max_degree=1, feature_importance_type='gcv')
                estimator.fit(XX, yy)
            except ValueError as e:
                print((l1, l2))
                print(e)
                raise

            self.final_product[(l1, l2)] = estimator
Example #24
def concatFeatures(embl, features):
    """
    Concatenate CDS features in EMBL format into the EMBL sequence file:
      - the first two lines of the EMBL sequence containing the ID & XX lines
      - the CDS features file containing the FT lines
      - the rest of the EMBL sequence containing the SQ lines
    Returns the name of the created EMBL sequence file
    """
    util.checkFile(embl)
    util.checkFile(features)
    outembl = embl.split(".")[0] + "_with_cds.embl"
    # Create command line
    head_cmd = "head -2 %s > %s; cat %s >> %s;" % (embl, outembl, features, outembl)
    util.runProcess(head_cmd)
    tail_cmd = "tail +3 %s > tail; cat tail >> %s; rm tail;" % (embl, outembl)
    util.runProcess(tail_cmd)
    logger.info("File %s created", outembl)
    return outembl
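On modern GNU coreutils the historical "tail +3" form is usually rejected; if that is a problem, the portable spelling of the same command would be (an alternative, not the original code):

tail_cmd = "tail -n +3 %s > tail; cat tail >> %s; rm tail;" % (embl, outembl)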
def execution_varying(train_df, L1, L2):
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    #Training Models
    logger.info("Model Training Initiation\n=====================")
    split = int(X_train.shape[0] / (L1 * L2))
    mars_ = Earth()
    start = time.time()
    kmeans = KMeans(n_clusters=L1, random_state=0)
    kmeans.fit(X_train[:, :2])
    l2_kmeans = KMeans(n_clusters=L2, random_state=0)
    for _ in range(L1):
        l2_kmeans.fit(X_train[:, 2:])
        for _ in range(L2):
            mars_.fit(X_train[:split, :], y_train[:split])
    return (time.time() - start)
Example #26
def splitSeqWithBiopython(embl, type):
    """
    Split the sequence into separate files, one per CDS feature, in the sequences/ directory
    using Biopython
    
    """
    util.checkFile(embl)
    # Create directory sequences/
    dirname = "sequences/"
    util.createDir(dirname)
    record = SeqIO.read(open(embl, "rU"), "embl")
    if len(record.features) == 0:
        sys.exit("ERROR: EMBL file %s without features" % embl)
    for feature in record.features:
        if feature.type == 'CDS':
            seq = record.seq
            
            # Build up a list of (start,end) tuples that will be used to slice the sequence
            locations = []
            # If there are sub_features, then this gene is made up of multiple parts.  
            if len(feature.sub_features): 
                for sf in feature.sub_features:
                    locations.append((sf.location.start.position, sf.location.end.position))
            # This gene is made up of one part.  Store its start and end position.
            else:
                locations.append((feature.location.start.position, feature.location.end.position))

            # Store the joined sequence and nucleotide indices forming the CDS.
            seq_str = '' 
            for begin, end in locations:
                seq_str += seq[begin:end].tostring()

            # Reverse complement the sequence if the CDS is on the minus strand  
            if feature.strand == -1:  
                seq_obj = Seq(seq_str, IUPAC.ambiguous_dna)
                seq_str = seq_obj.reverse_complement().tostring()
            
            logger.debug(feature)
            logger.debug(SeqRecord(seq=Seq(seq_str), id=feature.qualifiers['systematic_id'][0], description=feature.type).format('fasta'))
              
    logger.info("Sequences extracted into %s" % dirname) 
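The snippet above targets an older Biopython API (sub_features, .tostring()); with current Biopython the same per-CDS extraction can be sketched with feature.extract, which joins compound locations and reverse-complements minus-strand features itself (a sketch, not the original code):

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def split_cds_with_modern_biopython(embl_file):
    record = SeqIO.read(embl_file, "embl")
    for feature in record.features:
        if feature.type == "CDS":
            cds_seq = feature.extract(record.seq)  # handles sub-locations and strand
            cds_id = feature.qualifiers.get("systematic_id", ["unknown"])[0]
            print(SeqRecord(seq=cds_seq, id=cds_id, description=feature.type).format("fasta"))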
Example #27
def topReciprocalFastaHits(res_dir):
    """
    Extract top hits that cover at least 80% of the length of both sequences
    with at least 30% identity.
    Returns a dictionary of hits
    """
    # Identity cutoff for reciprocal searches
    ident_cutoff = 0.3
    # Length of hit cutoff for reciprocal searches
    len_cutoff = 0.8
    # TODO Create MSP crunch file
    # Top hits dictionary
    fastahits_dict = {}
    # Loop over the fasta results
    util.checkDir(res_dir)
    for (path, dirs, files) in os.walk(res_dir):
        for file in files:
            if not '.fa' in file:
                continue
            res_file = path + "/" + file
            logger.info("Reading... " +  res_file)
            # Read the fasta alignment results with biopython AlignIO fasta-m10     
            alignments = AlignIO.parse(open(res_file), "fasta-m10", seq_count=2)
            for alignment in alignments:
                # Select the hit based on cutoffs
                if float(alignment._annotations["sw_ident"]) < ident_cutoff:
                    continue
                record_query = alignment[0]
                record_match = alignment[1]
                overlap = float(alignment._annotations["sw_overlap"])
                if overlap/float(record_query.annotations["original_length"]) < len_cutoff and overlap/float(record_match.annotations["original_length"]) < len_cutoff:
                    continue
                
                record_query_region = "%s-%s" % (record_query._al_start, record_query._al_stop)
                record_match_region = "%s-%s" % (record_match._al_start, record_match._al_stop)
                # add hit into dictionary
                key = "%s||%s" % (record_match.id, record_query.id) # inverted key to be comparable with fasta hits
                value = "%s %s %s %s %s %s" % (alignment._annotations["sw_score"], alignment._annotations["sw_ident"], record_query_region, record_query.id, record_match_region, record_match.id)
                fastahits_dict[key] = value
    logger.info("Extract reciprocal fasta alignment hits finished")
    return fastahits_dict
Example #28
def chadoDump(dir):
    """
    Dump the polypeptide sequences of all organisms stored in geneDB/chado in FASTA format
    """
    util.createDir(dir)
    # Connect to geneDB as read only user using ropy.query
    query = ropy.query.QueryProcessor(connection=connectionFactory)
    query.setSQLFilePath(os.path.dirname(__file__) + "/sql/")
    
    # List of organisms
    query.addQueryFromFile("organism_query", "get_all_organisms_with_polyseq.sql")
    organism_rows = query.runQuery("organism_query")
    logger.info("Extracting %s organism sequences from geneDB. Please wait..." % len(organism_rows))
    
    # Add fasta query
    query.addQueryFromFile("fasta_query", "get_fasta_polyseq_for_organism.sql")
    
    for organism in organism_rows:
        organism_name = organism[1]
        organism_id = organism[0]
        if organism_name == "dummy":
            continue
        
        # Dump sequence of each organism into a fasta file
        logger.info("Extracting %s..." % organism_name)
        fasta_rows = query.runQuery("fasta_query", (organism_id, ))
        file_path = "%s/%s_%s.faa" % (dir, organism_id, organism_name)
        out = open(file_path, 'w')
        for row in fasta_rows:
            if row[0] is not None:
                out.write(row[0])
                out.write("\n")
        out.close()
        logger.info("    ...sequence extracted into %s." % file_path)
Example #29
def runHamapScan(seq_dir, hamap_dir):
    """
    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/
     
    pfscan compares a protein or nucleic acid sequence against a profile 
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    hamap_profile_file = "%s/hamap/hamap.prf" % os.path.dirname(__file__)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, hamap_profile_file, hamap_dir)
        util.submitJobArray(jobname='genepy-hamap', jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-hamap')
        logger.info("HAMAP scan on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".out"
            cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, seq_file, hamap_profile_file, hamap_dir, res_file)
            util.runProcess(cmd)
        logger.info("HAMAP scan finished")
    def __preprocessing_x(self, X, y, vigil=.05):
        # #Tuning k-parameter for kmeans
        c = 0
        prev_inertia = 0
        pres_inertia = 0
        init = True
        diff = np.inf
        X_ = X[:, :self.d // 2]
        logger.info("Shape of X in preprocessing x is : {}".format(X_.shape))
        while diff >= vigil:
            logger.info("Current diff {0}/{1}".format(diff, vigil))
            c += 1
            kmeans = KMeans(n_clusters=c, random_state=0)
            kmeans.fit(X_)
            pres_inertia = kmeans.inertia_
            if not init:
                diff = np.abs(prev_inertia - pres_inertia)
                prev_inertia = pres_inertia
            else:
                prev_inertia = pres_inertia
                init = False
        # #End of tuning
        logger.info("Number of clusters in X : {}".format(c))
        CLUSTERS = c
        kmeans = KMeans(n_clusters=CLUSTERS)
        kmeans.fit(X_)

        #Assigning to clusters
        for i in np.unique(kmeans.labels_):
            mask = np.where(kmeans.labels_ == i)
            self.data_in_clusters_L1[i] = np.column_stack((X[mask], y[mask]))
            logger.info("Data shape in cluster {} : {}".format(
                i, self.data_in_clusters_L1[i].shape))
        self.CLUSTER_CENTERS = kmeans.cluster_centers_
        logger.info("Cluster centers shape {}".format(
            self.CLUSTER_CENTERS.shape))
Example #31
def topFastaHits(res_dir, extractedseq_dir):
    """
    Extract top fasta alignment hits that cover at least 80% of the length of 
    both sequences with at least 30% identity.
    Creates an in-house fasta sequence file for each hit
    Returns a dictionary of hits
    """
    # Identity cutoff for reciprocal searches
    ident_cutoff = 0.3
    # Length of hit cutoff for reciprocal searches
    len_cutoff = 0.8
    # Extracted sequence directory
    util.createDir(extractedseq_dir)
    # TODO Create MSP crunch file
    # Top hits dictionary
    fastahits_dict = {}
    # Loop over the fasta results
    util.checkDir(res_dir)
    for (path, dirs, files) in os.walk(res_dir):
        for file in files:
            if not '.fa' in file:
                continue
            res_file = path + "/" + file
            logger.info("Reading... " +  res_file)
            # Read the fasta alignment results with biopython AlignIO fasta-m10     
            alignments = AlignIO.parse(open(res_file), "fasta-m10", seq_count=2)
            for alignment in alignments:
                # Select the hit based on cutoffs
                if float(alignment._annotations["sw_ident"]) < ident_cutoff:
                    continue
                record_query = alignment[0]
                record_match = alignment[1]
                overlap = float(alignment._annotations["sw_overlap"])
                if overlap/float(record_query.annotations["original_length"]) < len_cutoff and overlap/float(record_match.annotations["original_length"]) < len_cutoff:
                    continue
                # Create SeqRecord of selected hit
                extractedseq_record = SeqRecord(seq=Seq(str(record_match.seq).replace('-', '')), id=record_match.id, description=res_file)
                extractedseq_file = "%s/%s.faa" % (extractedseq_dir, record_match.id)
                # Print match sequence of selected hit into fasta file
                output_handle = open(extractedseq_file, "w")
                SeqIO.write([extractedseq_record], output_handle, "fasta")
                output_handle.close()
                logger.info("    ...sequence extracted into %s" % extractedseq_file)
                record_query_region = "%s-%s" % (record_query._al_start, record_query._al_stop)
                record_match_region = "%s-%s" % (record_match._al_start, record_match._al_stop)
                # add hit into dictionary
                key = "%s||%s" % (record_query.id, record_match.id)
                # value in MSP crunch format
                value = "%s %s %s %s %s %s" % (alignment._annotations["sw_score"], alignment._annotations["sw_ident"], record_query_region, record_query.id, record_match_region, record_match.id)
                fastahits_dict[key] = value
    logger.info("Extract fasta alignment hits finished")
    return fastahits_dict
def generate_subqueries_for_files():
    directory = os.fsencode('input/Crimes_Workload')
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.startswith('test') and 'subqueries_{}'.format(filename) not in existing:
            logger.info("Loading Workload : {}".format(filename))
            df = pd.read_csv('input/Crimes_Workload/{}'.format(filename), index_col=0)
            global std
            std = df[['x','y','x_range','y_range']].std().values
            pertubations = Parallel(n_jobs=4, verbose=2)(delayed(get_pertubations)(sq)
                                                                                for sq in df.values[:1000,:])
            pertubations = np.array(pertubations)
            logger.info("Saving file {}".format(filename))
            np.save('input/Subqueries/subqueries_{}'.format(filename),pertubations)
        else:
            logger.info("Skipping {}".format(filename))
Example #33
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA on the extracted in-house protein sequences against the new genome
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # Check new genome
    util.checkFile(genome_file)
    # Check ref genome extracted sequences
    util.checkDir(seq_dir)
    res_dir = fasta_dir
    if IS_LSF:
        # Rename new genome sequences for job array to be refgenome_1.faa refgenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'refgenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/refgenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, res_dir)
        util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-recipfasta')
        logger.info("Reciprocal Fasta on LSF finished")
    else:
        # List of inhouse extracted genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, seq_file, genome_file, res_dir, res_file)
            util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Reciprocal Fasta finished")
def load_data():
    logger.info("Loading Data...")
    data = pd.read_csv('input/Crimes_-_2001_to_present.csv', header=0)
    global dd
    dd = data[['X Coordinate', 'Y Coordinate', 'Arrest', 'Beat']]
async def console_print(c):
    if type(c) is str:
        logger.info(c)
    else:
        logger.debug(f"Print data: {str(c.json())}")
    async def close(self):
        logger.info(self.id + " is stopped")
        await self.send(f"{self.ytid} is stopped")
        self.livechat.terminate()
    def show_status(self):
        logger.info("check: " + ",".join([i.id for i in self.videos]))

        # save state to file
        if self.state:
            self.write_state()
parser.add_argument("--verbose",
                    dest='verbosity',
                    help="increase output verbosity",
                    action="store_true")
parser.add_argument('-v',
                    help='verbosity',
                    dest='verbosity',
                    action="store_true")
args = parser.parse_args()
if args.verbosity:
    print("verbosity turned on")
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)
if not os.path.exists('output/Accuracy'):
    logger.info('creating directory Accuracy')
    os.makedirs('output/Accuracy')


def kl_divergence_error(y, y_hat):
    kd = KernelDensity(bandwidth=0.75).fit(y.reshape(-1, 1))
    yp = kd.score_samples(y.reshape(-1, 1))
    kd = KernelDensity(bandwidth=0.75).fit(y_hat.reshape(-1, 1))
    ypg = kd.score_samples(y_hat.reshape(-1, 1))
    return entropy(yp, ypg)
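A quick usage sketch, assuming numpy is imported as np; both arguments are 1-D arrays and the result is a scalar divergence-style score over their KDE log-density profiles:

y_true = np.random.normal(size=200)
y_pred = y_true + np.random.normal(scale=0.1, size=200)
score = kl_divergence_error(y_true, y_pred)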


def model_based_divergence(X, y, model_2):
    model_1 = Earth(feature_importance_type='gcv')
    model_1.fit(X, y)
    features_l = model_1.feature_importances_
Example #39
import logging
import sys
from setup import logger
from PyQt5 import QtWidgets

from app.main_window import MainWindow
from db import db

if __name__ == "__main__":
    logger.setLevel(logging.DEBUG)
    logger.info('App start')

    app = QtWidgets.QApplication(sys.argv)

    main_window = None
    try:
        db.connect()
        db.create_tables([])  # fixme

        main_window = MainWindow(app)
        main_window.show()
        sys.exit(app.exec())
    except Exception as e:
        logger.exception(e)
    finally:
        db.close()  # todo check
        if main_window:
            main_window.stop()
        logger.info('App close')
import pandas as pd
import numpy as np
import os
import sys

os.chdir("../../../explanation_framework")
sys.path.append("../explanation_framework")
sys.path.append('utils')

from terminal_outputs import printProgressBar
from confs import Config
from setup import logger
from joblib import Parallel, delayed

if not os.path.exists('input/Subqueries'):
    logger.info('creating directory Subqueries')
    os.makedirs('input/Subqueries')
existing = set(os.listdir('input/Subqueries'))
DIM = 2
NSUBQUERIES = Config.NSUBQUERIES

def load_data():
    logger.info("Loading Data...")
    data = pd.read_csv('input/Crimes_-_2001_to_present.csv', header=0)
    global dd
    dd = data[['X Coordinate', 'Y Coordinate', 'Arrest', 'Beat']]

def vectorize_query(q):
    # res = dd_matrix[np.all((dd_matrix[:,:2]>q[:,:DIM]-q[:,DIM:2*DIM]) & (dd_matrix[:,:2]<q[:,:DIM]+q[:,DIM:2*DIM]),axis=1)]
    res = dd[(dd['X Coordinate']>float(q[:,0]-q[:,2])) & (dd['X Coordinate']<float(q[:,0]+q[:,2])) & (dd['Y Coordinate']>float(q[:,1]-q[:,3])) & (dd['Y Coordinate']<float(q[:,1]+q[:,3]))].values
    return np.array([res.shape[0], np.sum(res[:,2]),np.mean(res[:,3])]) if res.shape[0]!=0 else np.zeros(3)
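vectorize_query expects a 1x4 array laid out as [x_center, y_center, x_range, y_range]; a small usage sketch with arbitrary values:

load_data()  # populates the global dataframe dd used inside vectorize_query
q = np.array([[1154000.0, 1890000.0, 5000.0, 5000.0]])  # [x, y, x_range, y_range]
count_, n_arrests, mean_beat = vectorize_query(q)  # zeros if no rows fall in the box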
Example #41
def main():
    # Fasta file extension: 
    # .ffn for the untranslated nucleotide sequences of each CDS; .faa for the translated protein (amino acid) sequences of each CDS
    # .fa for the fasta alignment results
    # .fna for whole genomic DNA sequences; .frn for nucleotide sequences of RNA related features
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dna", metavar="FILE", help="input dna FILE in fasta format", action="store", type="string", dest="dna")
    parser.add_option("-t", "--tab", metavar="FILE", help="input tab FILE in embl format", action="store", type="string", dest="tab")
    parser.add_option("-e", "--embl", metavar="FILE", help="input embl FILE with CDS features in embl format", action="store", type="string", dest="embl")
    parser.add_option("--genedb", help="extract reference genome protein sequences from geneDB", action="store_true", dest="db")
    parser.add_option("--fasta", help="run fasta against each extracted in-house genomes", action="store_true", dest="fasta")
    parser.add_option("--hamap", help="run pfscan against HAMAP profiles", action="store_true", dest="hamap")
    parser.add_option("--clean", help="delete all results without deleting reference genomes", action="store_true", dest="clean")
    parser.add_option("--deepclean", help="delete all reference genomes and results", action="store_true", dest="deepclean")
    (options, args) = parser.parse_args()
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    # Print command line
    cmdline = "$ python "
    for argv in sys.argv:
        cmdline += argv + " " 
    logger.debug(cmdline)
    
    # >>> ---------------------------------------------------------------------
    # >>> DATA PREPARATION
    # >>> ---------------------------------------------------------------------
    # List of needed software
    for softname in soft_lists:
        util.checkSoft(softname)
    # Prepare new genome data
    if options.dna and options.tab and not options.embl:
        util.checkFile(options.dna)
        mygenome_emblfile = fasta2embl(options.dna)
        mygenome_emblfile_withcds = concatFeatures(mygenome_emblfile, options.tab)
        splitSeq(mygenome_dir, mygenome_emblfile_withcds, "CDS")
        translateSeq(mygenome_dir)
    elif not options.dna and not options.tab and options.embl:
        mygenome_emblfile_withcds = options.embl
        splitSeq(mygenome_dir, mygenome_emblfile_withcds, "CDS")
        #splitSeqWithBiopython(mygenome_emblfile_withcds, "CDS") # does not work with testdata_01
        translateSeq(mygenome_dir)
    elif not options.deepclean:
        util.checkDir(mygenome_dir)
    # Extract in house genomes from chado db
    if options.db:
        chadoDump(refgenomes_dir)
    elif not options.deepclean:
        util.checkDir(refgenomes_dir)
    # bsub output directory
    if IS_LSF and not (options.clean or options.deepclean):
        util.createDir(bsub_dir)

    # >>> ---------------------------------------------------------------------
    # >>> ORTHOLOG SEARCH
    # >>> ---------------------------------------------------------------------
    # Run fasta & reciprocal fasta
    if options.fasta:
        runFasta(mygenome_dir, refgenomes_dir, fasta_dir)
        fasta_hits = topFastaHits(fasta_dir, refgenomes_extractedseq_dir)
        concatSeq(mygenome_fastafile_allcds, mygenome_dir)
        runReciprocalFasta(refgenomes_extractedseq_dir, mygenome_fastafile_allcds, reciprocalfasta_dir)
        reciprocalfasta_hits = topReciprocalFastaHits(reciprocalfasta_dir)
        printMSPCrunch(fasta_hits, reciprocalfasta_hits)
        hits = getHits(fasta_hits, reciprocalfasta_hits)
        logger.info("ORTHOLOGS")
        logger.info(hits['ortholog'])
        logger.info("SIMILARITY")
        logger.info(hits['similarity'])
        transferFeatures(hits['ortholog'])
    # Run hamap scan
    if options.hamap:
        runHamapScan(mygenome_dir, hamap_dir)

    # >>> ---------------------------------------------------------------------
    # >>> CLEANING OUTPUT DATA
    # >>> ---------------------------------------------------------------------
    # Clean results before a re-run
    if options.clean:
        # fasta results
        util.rmDir(fasta_dir)
        util.rmDir(reciprocalfasta_dir)
        util.rmDir(refgenomes_extractedseq_dir)
        util.rmFile(mygenome_fastafile_allcds)
        # hamap results
        util.rmDir(hamap_dir)
        # bsub outputs
        if IS_LSF:
            util.rmDir(bsub_dir)
    # Deep clean - remove all
    if options.deepclean:
        util.rmDir(refgenomes_dir)
        util.rmDir(mygenome_dir)
        util.rmDir(fasta_dir)
        util.rmDir(reciprocalfasta_dir)
        util.rmDir(refgenomes_extractedseq_dir)
        util.rmFile(mygenome_fastafile_allcds)
        util.rmDir(hamap_dir)
Example #42
def accuracy_on_crimes():
    logger.info("Finding datasets...")
    directory = os.fsencode('input/Crimes_Workload')
    directory_sub = os.fsencode('input/Subqueries/')
    patterns = {'gauss-gauss': '*x-gauss*-length-gauss*',
               'gauss-uni': '*x-gauss*-length-uniform*',
               'uni-gauss': '*x-uniform*-length-gauss*',
               'uni-uni': '*x-uniform*-length-uniform*',}
    train_datasets = {}
    test_datasets = {}
    sub_datasets = {}

    for p in patterns:
        res = [os.fsdecode(n) for n in os.listdir(directory) if fnmatch.fnmatch(os.fsdecode(n), patterns[p])]
        train_datasets[p] = res[0] if res[0].startswith('train') else res[1]
        test_datasets[p] = res[0] if res[0].startswith('test') else res[1]
        sub_datasets[p] = [os.fsdecode(n) for n in os.listdir(directory_sub) if fnmatch.fnmatch(os.fsdecode(n), patterns[p])][0]

    res_eval = {'model': [],
               'dataset': [],
               'aggregate_name': [],
               'kl': [],
               'r2':[],
               'md':[],
               'nrmse':[]}
    #Main
    for p in patterns:
        logger.info('Beginning Evaluation for {0}'.format(p))
        logger.info('Loading Datasets...')

        test_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(test_datasets[p]), index_col=0)
        train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))

        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count','sum_','avg']
        agg_map = {'count' :4, 'sum_':5, 'avg':6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x','y','x_range','y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            kmeans = KMeans()
            lr = Ridge()

            lsnr = PR(lr)
            lsnr.fit(X_train,y_train)

            lr_global = LinearRegression()
            lr_global.fit(X_train, y_train)

            logger.info("Accuracy Evaluation on Test set\n=====================")
            for i in range(1000):
                #Obtain query from test-set
                dataset = p
                printProgressBar(i, 1000,prefix = 'Progress:', suffix = 'Complete', length = 50)

                q = test_df.iloc[i].values[:4].reshape(1,-1)
                q = sc.transform(q)
                #Obtain subquery pertubations for query q from test set
                q1 = sub[i]
                X = q1[:,:4]
                y = q1[:,agg_map[agg]]
                X = sc.transform(X)
                # Train local model (Should be the best out of the 3)
                lr = LinearRegression()
                lr.fit(X,y)
                y_hat = lr.predict(X)
                metrics_for_model('local',dataset,agg,y_hat,X, y, lr,res_eval)

                #Obtain metrics for our
                y_hat_s = lsnr.get_model(q).predict(X)
                metrics_for_model('ours',dataset,agg,y_hat_s,X,y,lsnr.get_model(q) ,res_eval)


                #Obtain metrics for global
                y_hat_g = lr_global.predict(X)
                metrics_for_model('global',dataset,agg,y_hat_g,X,y,lr_global,res_eval)
            logger.info("Finished Queries")
    eval_df = pd.DataFrame(res_eval)
    eval_df.to_csv('output/Accuracy/evaluation_results_linear.csv')
Example #43
gattr_to_table_map = {
    key: value
    for key, value in zip(df['column_name'].values, df['table_name'].values)
}
print(gattr_to_table_map)
#print(attrs_array)
# attrs_dict = { key : [] for key in attrs_array } #dict.fromkeys(attrs_array,[[]]*len(attrs_array))
distinct_attr = {}
i = 0
qdf = None
j = 0
tot_query_answering_time = 0
start = time.time()
for qname, q in queries:
    logger.info("Query :\n{}\n".format(q))
    ####Execute Query and obtain result
    start_query = time.time()
    cur.execute(q)
    tot_query_answering_time += (time.time() - start_query)
    res = cur.fetchall()
    res_df = pd.DataFrame(res)
    res_df = res_df.set_index(np.arange(i, i + res_df.shape[0]))
    if res_df.empty:
        logger.debug("Query is empty")
        j += 1
        continue
    pr = Parser()
    qv = QueryVectorizer(set(df['column_name'].tolist()))
    #Begin parsing the query and vectorizing its parameters
    pr.parse(q)
Example #44
parser.add_argument("--verbose",
                    dest='verbosity',
                    help="increase output verbosity",
                    action="store_true")
parser.add_argument('-v',
                    help='verbosity',
                    dest='verbosity',
                    action="store_true")

args = parser.parse_args()
if args.verbosity:
    print("verbosity turned on")
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)
if not os.path.exists('output/Performance'):
    logger.info('creating directory Performance')
    os.makedirs('output/Performance')


def execution_time(train_df):
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    #Training Models
    logger.info("Model Training Initiation\n=====================")
    kmeans = KMeans(random_state=0)
    mars_ = Earth(feature_importance_type='gcv', )

    lsnr = PR(mars_, vigil_x=0.01)
Example #45
parser.add_argument("--verbose", dest='verbosity', help="increase output verbosity",
                    action="store_true")
parser.add_argument('-v',help='verbosity',dest='verbosity',action="store_true")
parser.add_argument("--crimes",dest="crimes", action="store_true")
parser.add_argument("--higgs",dest="higgs", action="store_true")
parser.add_argument("--accelerometer",dest="accelerometer", action="store_true")

args = parser.parse_args()

if args.verbosity:
    print("verbosity turned on")
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)
if not os.path.exists('output/Accuracy'):
    logger.info('creating directory Accuracy')
    os.makedirs('output/Accuracy')
if not (args.crimes or args.higgs or args.accelerometer):
    logger.info("No data set specified")
    sys.exit()


def kl_divergence_error(y, y_hat):
    kd = KernelDensity(bandwidth=0.75).fit(y.reshape(-1,1))
    yp = kd.score_samples(y.reshape(-1,1))
    kd = KernelDensity(bandwidth=0.75).fit(y_hat.reshape(-1,1))
    ypg = kd.score_samples(y_hat.reshape(-1,1))
    return entropy(yp,ypg)

def model_based_divergence(X,y, model_2):
    model_1 = LinearRegression()# Earth(feature_importance_type='gcv')
Example #46
def accuracy_on_higgs():
    logger.info("Starting Accuracy Tests on Higgs")
    logger.info("================================")
    df = pd.read_csv('input/sample_higgs_0.01.csv', index_col=0)
    X = df[['m_bb','m_wwbb']].dropna().values
    y = df['label']
    min_ = np.min(X, axis=0)
    max_ = np.max(X, axis=0)
    X = (X-min_) / (max_-min_)
    data = np.column_stack((X,y))
    x = np.linspace(0.1,0.9,7)
    xx,yy = np.meshgrid(x,x)
    DIMS = X.shape[1]
    cov = np.identity(DIMS)*0.001
    cluster_centers = np.column_stack((xx.ravel(),yy.ravel()))
    query_centers = []
    #Generate queries over cluster centers
    for c in cluster_centers:
        queries = np.random.multivariate_normal(np.array(c), cov, size=40)
        query_centers.append(queries)
    query_centers = np.array(query_centers).reshape(-1,DIMS)

    ranges = np.random.uniform(low=0.005**(1/3), high=0.25**(1/3), size=(query_centers.shape[0], DIMS))
    queries = []
    empty = 0
    for q,r in zip(query_centers,ranges):
            b = generate_boolean_vector(data,q,r,2)
            res = data[b]
            if res.shape[0]==0:
                empty+=1

            ans = float(np.mean(res[:,-1])) if res.shape[0]!=0 else 0
            qt = q.tolist()
            qt += r.tolist()
            qt.append(ans)
            queries.append(qt)
    qs = np.array(queries).reshape(-1, 2*DIMS+1)
    X_train, X_test, y_train, y_test = train_test_split(
         qs[:,:qs.shape[1]-1], qs[:,-1], test_size=0.4, random_state=0)
    earth  = Earth()
    lsnr = PR(earth)
    lsnr.fit(X_train, y_train)
    y_hat = np.array([float(lsnr.get_model(x.reshape(1,-1)).predict(x.reshape(1,-1))) for x in X_test])
    r2 = metrics.r2_score(y_test,y_hat)
    kl = kl_divergence_error(y_test, y_hat)
    nrmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat))/np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2, nrmse, kl))
    # Linear Regression comparison
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat_lr = lr.predict(X_test)
    r2_lr = metrics.r2_score(y_test, y_hat_lr)
    kl_lr = kl_divergence_error(y_test, y_hat_lr)
    nrmse_lr = np.sqrt(metrics.mean_squared_error(y_test, y_hat_lr))/np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2_lr, kl_lr, nrmse_lr))
    dic = {}
    dic['LPM' ]= [('r2',r2), ('kl',kl), ('nrmse',nrmse)]
    dic['LR'] = [('r2',r2_lr), ('kl',kl_lr), ('nrmse',nrmse_lr)]
    # Polynomial regression comparison
    for count, degree in enumerate(np.arange(3,10,2)):
         model = make_pipeline(PolynomialFeatures(degree), Ridge())
         model.fit(X_train, y_train)
         y_hat = model.predict(X_test)
         r2_p = metrics.r2_score(y_test,y_hat)
         kl_p = kl_divergence_error(y_test, y_hat)
         nrmse_p = np.sqrt(metrics.mean_squared_error(y_test, y_hat))/np.mean(y_test)
         dic["LR ({})".format(degree)] = [('r2',r2_p), ('kl',kl_p), ('nrmse',nrmse_p)]
         print("R2 for degree {} : {}".format(degree, metrics.r2_score(y_test, y_hat)))
    logger.info("==============================================")
    with open('output/Accuracy/multiple_methods_higgs.pkl', 'wb') as handle:
        pickle.dump(dic, handle)
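generate_boolean_vector is not defined in this excerpt; a plausible sketch consistent with how it is called above, selecting rows whose first dims columns fall inside the box q +/- r:

import numpy as np

def generate_boolean_vector(data, q, r, dims):
    # Boolean mask of rows whose first `dims` columns lie inside the box q +/- r
    return np.all((data[:, :dims] > q[:dims] - r[:dims]) &
                  (data[:, :dims] < q[:dims] + r[:dims]), axis=1)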