Example #1
def get_pdf(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return

    datadir = '%s/%s' % (DIR['data'], assembly_id)
    pdfdir = '%s/%s' % (DIR['pdf'], assembly_id)
    txtdir = '%s/%s' % (DIR['txt'], assembly_id)
    utils.check_dir(pdfdir)
    utils.check_dir(txtdir)

    failed = []
    jsons = os.listdir(datadir)[range[0]:range[1]]

    for json in jsons:
        if bill_ids and json.split('.', 1)[0] not in bill_ids:
            continue
        print json
        try:
            download(assembly_id, json, datadir, pdfdir)
            pdffile = '%s/%s' % (pdfdir, json.replace('json', 'pdf'))
            txtfile = '%s/%s' % (txtdir, json.replace('json', 'txt'))
            #TODO: apply celery
            try:
                pdf2txt(pdffile, txtfile)
            except (PSEOF, PDFSyntaxError) as e:
                print 'Failed parsing %s with %s' % (json, e)
                failed.append((json, e))
            except IOError as e:
                print 'File not found for %s: %s' % (json, e)
                failed.append((json, e))

        except (IndexError, TypeError) as e:
            print 'Failed downloading %s with %s' % (json, e)
            failed.append((json, e))
    print 'Failed files: ', failed
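Every snippet on this page calls a project-local check_dir helper before writing output, but none of the snippets include its definition, and the signature even varies between projects (Example #6 passes a permission mode first, Example #24 uses the return value). The sketch below is only an assumed, typical implementation: create the directory, including missing parents, if it does not already exist.

import os

def check_dir(path):
    # Assumed behaviour: create `path` (and any missing parents) if it is absent.
    if not os.path.isdir(path):
        os.makedirs(path)
    return path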
Example #2
File: html.py Project: JH27/crawlers
def get_html(assembly_id, npages):

    def get_page(baseurl, page, directory, npages):
        try:
            url = baseurl + '&PAGE=%d&PAGE_SIZE=%d' % (page, PAGE_SIZE)
            pn = npages - page + 1
            fn = '%s/%d.html' % (directory, pn)

            is_first = True
            while is_first or 'TEXTAREA ID="MSG" STYLE="display:none"' in doc:
                doc = utils.get_webpage_text(url)
                is_first = False

            with open(fn, 'w') as f:
                f.write(doc)

            sys.stdout.write('%s\t' % pn)
            sys.stdout.flush()

        except (requests.exceptions.RequestException, IOError) as e:
            print '\nFailed to get %s due to %s' % (fn, repr(e))

    baseurl, directory = convert(assembly_id)
    utils.check_dir(directory)

    #
    print 'Downloading:'
    jobs = [gevent.spawn(get_page, baseurl, page, directory, npages)\
            for page in range(1, npages+1)]
    gevent.joinall(jobs)

    return npages
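get_page above re-downloads the page for as long as the hidden MSG textarea (an error marker) shows up in the response. Below is a hedged, standalone sketch of the same retry pattern with an added attempt cap so it cannot loop forever; fetch, error_marker and max_retries are illustrative names, not part of the crawler.

def fetch_until_clean(fetch, url, error_marker, max_retries=5):
    # Call fetch(url) once, then retry while the error marker is present,
    # giving up after max_retries attempts in total.
    doc = fetch(url)
    attempts = 1
    while error_marker in doc and attempts < max_retries:
        doc = fetch(url)
        attempts += 1
    return doc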
Example #3
def main():
    # Get arguments
    args = parse_args()
    filein, join_path, io_type = args.filename, args.join, args.type

    # Parse the document
    with open(filein) as fin:
        text = fin.read()
    examples = DocParser(text).parse()

    # Get files to join
    if join_path:
        targets = []
        filenames = get_joining_filenames(join_path, len(examples), io_type)
        for filename in filenames:
            with open(os.path.join(join_path, filename)) as fin:
                try:
                    target = json.load(fin, object_pairs_hook=OrderedDict)
                except json.JSONDecodeError:
                    raise ValueError('{} has invalid format'.format(filename)) from None
            targets.append(target)
    else:
        targets = (OrderedDict() for _ in iter(int, 1))

    # Turn data into output dicts
    dicts_in = dicts_out = [OrderedDict() for _ in range(len(examples))]
    extend_dicts(examples, dicts_in, dicts_out)

    # Check output directory
    check_dir(OUT_DIR)

    # Write target dicts to files
    export_examples(dicts_in, io_type, 'json')
Example #4
def main():
    args = argparser.parse_args()

    emf_file = args.emf
    out_root = args.outroot
    # img_root = args.imgroot
    # tree_root = args.treeroot
    clades_pickle = args.species_cache
    prefix = args.prefix

    all_species = utils.ens_get("/info/species/")["species"]
    all_species_names = [ it["name"].replace("_", " ") for it in all_species ]
    all_species_names.remove("Ancestral sequences")

    if path.exists(clades_pickle):
        Clades = pickle.load(open(clades_pickle, 'rb'))
    else:
        Clades = filter_clades(all_species_names,
                               [ "Eutheria", "Glires", "Laurasiatheria", "Sauria", "Mammalia", "Primates" ])
        pickle.dump(Clades, open(clades_pickle, 'wb'))

    pprint(Clades)

    TL = TCCList()
    TL.add(TCC(Clades[args.clade], operator.ge, args.thr))

    utils.check_dir(path.join(out_root, args.clade))

    tree_id = 1
    for tree in emf.EMF(emf_file):
        print tree_id
        # treedir = path.join(tree_root, str(tree_id)[:2])
        # utils.check_dir(treedir)

        # tree.write(outfile=path.join(treedir, "{}.nh".format(tree_id)))

        seqsets, subtrees = split_tree(tree, TL, prefix)
        outdir = path.join(out_root, args.clade, str(tree_id)[:2])
        utils.check_dir(outdir)

        # Treevis
        # layout = make_layout(seqsets)
        # imgdir = path.join(img_root, args.clade)
        # utils.check_dir(imgdir)
        # imgfile = path.join(imgdir, "{}.pdf".format(tree_id))
        # tree.render(imgfile, layout=layout)

        set_id = 1
        for seqset, subtree in zip(seqsets, subtrees):
            outfile = open(path.join(outdir, "{0}_{1}.tab".format(tree_id, set_id)), 'w')
            for seqid in seqset:
                print >>outfile, '\t'.join(seqid)

            subtree.write(outfile=path.join(outdir, "{0}_{1}.nh".format(tree_id, set_id)), 
                          format=6)
            set_id += 1

        tree_id += 1
Example #5
    def __write_fastqc(self, o, pobj, r):
        # Dict for reporter 
        incurr = {
          'leftseq': {'data': None,'summary': None}, 
          'rightseq': {'data': None,'summary': None}, 
          'paired': r.paired, 
          'readgroup': r.readgroup
        }
        ocurr = {
          'leftseq': None, 
          'rightseq': None,
          'paired': r.paired,
          'readgroup': r.readgroup
        }

        # Set files
        fqdir = os.path.join(pobj.files['results_path'], \
                  '{0.name}/00_Preprocessing/fastqc/{1.readgroup}'.format(self, r))
        o.write('# Fastqc\n')
        o.write('fastqcResultsDir = {0}\n'.format(fqdir))

        raw1 = r.leftseq 
        out1 = [i for i in [re.search('([\w\_\-\.\d]+)\.txt\.gz$', raw1),
                    re.search('([\w\_\-\.\d]+)\.txt$', raw1),
                    re.search('([\w\_\-\.\d]+)\.fastq\.gz$', raw1),
                    re.search('([\w\_\-\.\d]+)\.fastq$', raw1)] if i][0].group(1)
        o.write('leftFastqcResultsDir = {0}_fastqc\n'.format(out1)) 
        incurr['leftseq']['data'] = os.path.join(fqdir, '{0}_fastqc/fastqc_data.txt'.format(out1))
        incurr['leftseq']['summary'] = os.path.join(fqdir, '{0}_fastqc/summary.txt'.format(out1))
        ocurr['leftseq'] = os.path.join(pobj.files['report_path'], \
          'project_reads/{0.name}/{1.readgroup}/leftseq'.format(self, r))
        utils.check_dir(ocurr['leftseq'])
        self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_data.txt')) 
        self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_gcbd.txt')) 
        self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_pbnc.txt')) 
        self.reporter_files.append(os.path.join(ocurr['leftseq'], 'fastqc_qbd.txt')) 

        if r.paired:
            raw2 = r.rightseq 
            out2 = [i for i in [re.search('([\w\_\-\.\d]+)\.txt\.gz$', raw2),
                        re.search('([\w\_\-\.\d]+)\.txt$', raw2),
                        re.search('([\w\_\-\.\d]+)\.fastq\.gz$', raw2),
                        re.search('([\w\_\-\.\d]+)\.fastq$', raw2)] if i][0].group(1)
            o.write('rightFastqcResultsDir = {0}_fastqc\n'.format(out2))
            incurr['rightseq']['data'] = os.path.join(fqdir, '{0}_fastqc/fastqc_data.txt'.format(out2))
            incurr['rightseq']['summary'] = os.path.join(fqdir, '{0}_fastqc/fastqc_summary.txt'.format(out2))
            ocurr['rightseq'] = os.path.join(pobj.files['report_path'], \
              'project_reads/{0.name}/{1.readgroup}/rightseq'.format(self, r))
            utils.check_dir(ocurr['rightseq'])
            self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_data.txt')) 
            self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_gcbd.txt')) 
            self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_pbnc.txt')) 
            self.reporter_files.append(os.path.join(ocurr['rightseq'], 'fastqc_qbd.txt')) 
        o.write('\n')

        # Add to dict
        self.reporter_obj['reads']['inputs'][r.phenotype.lower()].append(incurr)
        self.reporter_obj['reads']['outputs'][r.phenotype.lower()].append(ocurr)
Example #6
File: rrd_helper.py Project: clan/ops
def rrd_graph_sum(ldir, loc, width=1080, height=384):
    if len(loc['hosts']) == 0:
        logging.warn("location `%s': no host(s) found" % (loc['name'],))
        return
    lid = osp.join(ldir, loc['id'])
    check_dir(0755, lid)
    rrds.rrd_graph_sum_net(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_udp(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_cpu(lid, loc['name'], loc['hosts'], width, height)
    rrds.rrd_graph_sum_mem(lid, loc['name'], loc['hosts'], width, height)
    return
Example #7
    def output(self):
        job = Job(job_id=self.jobid)
        # print(job.path_map.dir_dict)
        self.output_file = os.path.join(job.path_map.tmp_dir, 'dir_dict.yaml')
        for path in job.path_map.dir_dict.itervalues():
            utils.check_dir(path=path)
            # print(path)

        yaml.dump(data=job.path_map.dir_dict, stream=open(self.output_file, 'w'))

        return luigi.LocalTarget(path=self.output_file)
Example #8
File: kami.py Project: luccox/vaca
    def checkpoint(self):
        self._update_request()
        occupation = numpy.array([len(self.wally[self.wally[i,j,:,0] != 0])
                                      for i in xrange(self.size)
                                      for j in xrange(self.size)]).reshape(self.size,self.size)
        utils.check_dir('wally')
        utils.pyplot_from_array(str(self.tick), occupation, self.capacity)

        self.conn.root['conf']['tick'] = self.tick
        self.conn.root['wally'] = self.wally
        self.conn.root['rookies'] = self.rookies
        self.conn.commit()
Example #9
File: html.py Project: JH27/crawlers
def get_npages(assembly_id):
    url, directory = convert(assembly_id)
    utils.check_dir(directory)

    fn = '%s/tmp.html' % directory
    utils.get_webpage(url, fn)
    page = utils.read_webpage(fn)
    m = re.search(u'총(.+)건', page.xpath('//span[@class="text3"]/text()')[0])
    nbills = int(m.group(1))
    npages = int(math.ceil(nbills/float(PAGE_SIZE)))
    print 'Total %d bills, %d pages to %s' % (nbills, npages, directory)
    return npages
Example #10
    def search(self, data, classes):
        """
        Searches for the patterns based on expression data 
        
        Input
        -----
        data : numpy array[n_samples, n_genes], GE data
        classes : numpy zero-one array[n_samples]
        """
        self.patterns = []
        c = 0
        time_d = 0
        for seed in self.seeds:
            # print seed
            c += 1
            if self.verbose and c % 100 == 0:
                print "Searching with seed %s" % str(seed)
                print np.mean(time_d)
                time_d = 0

            pattern = self.search_method.create_pattern(data, seed)
            pattern.evaluate(data, self.metric, classes)
            st = time.clock()
            while True:
                next_pattern = max(
                    pattern.expand(self.network, self.radius), key=lambda ex: ex.evaluate(data, self.metric, classes)
                )
                if (next_pattern.score / pattern.score) > 1 + self.min_improve:
                    pattern = next_pattern
                # print "zlepseni",pattern.score
                else:
                    break
            # pattern.edges = filter_edges(pattern.edges, pattern.genes)
            time_d += time.clock() - st
            if self.trivial_patterns or len(list(seed)[0]) > 2:
                self.patterns += [pattern]
                check_dir(self.base_dir + "greedy_search_pics/")
                if self.draw:
                    gene_color = dict()
                    for gene in pattern.genes:
                        edges_names = set((self.gene_names[h1], self.gene_names[h2]) for (h1, h2) in pattern.edges)
                        # a function to color a gene in discovered pattern
                        gene_color[self.gene_names[gene]] = scipy.stats.ttest_ind(data[:, -1], data[:, gene])
                    print "Drawing a graph for seed %s" % str(seed)
                    draw_graph(edges_names, self.base_dir + "greedy_search_pics/test-graph-greedy", seed)

            # if seed > 550:
            #     break

        return self.patterns
Example #11
def html2json(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return

    metafile = '%s/%d.csv' % (DIR['meta'], assembly_id)
    print metafile
    meta = pd.read_csv(metafile, dtype={'bill_id': object, 'link_id': object})

    jsondir = '%s/%s' % (DIR['data'], assembly_id)
    utils.check_dir(jsondir)

    if not bill_ids:
        bill_ids = meta['bill_id'][range[0]:range[1]]
    jobs = [gevent.spawn(parse_page, assembly_id, bill_id, meta, jsondir) for bill_id in bill_ids]

    gevent.joinall(jobs)
Example #12
def main():
    # Get arguments
    args = parse_args()

    # Parse the document
    with open(args.filename) as fin:
        text = fin.read()
    examples = DocParser(text).findall()

    # Check output directory
    check_dir(OUT_DIR)

    # Generate templates
    def gen(gters):
        for gter in gters:
            gter(examples).generate()
    gen([PythonGenerator, JavaGenerator])
Example #13
def html2csv(assembly_id, npages):
    def list_to_file(l, f):
        f.write('"')
        f.write('","'.join(l).encode("utf-8"))
        f.write('"\n')

    def parse_columns(columns):
        data = []
        for j, c in enumerate(columns):
            if j == 1:
                status = str(int(re.findall(r"[0-9]+", c.xpath("img/@src")[0])[0]))
                title = c.xpath("a/text()")[0].replace('"', "'")
                link = re.findall(r"\w+", c.xpath("a/@href")[0])[2]
                data.extend([status, title, link])
            elif j == 6:
                data.append("1" if c.xpath("img/@onclick") else "0")
            else:
                data.append(c.xpath("text()")[0].strip())
        return data

    def parse_page(page, f, assembly_id):
        fn = "%s/%s/%d.html" % (DIR["list"], assembly_id, page)
        p = utils.read_webpage(fn)
        rows = utils.get_elems(p, X["table"])

        for r in reversed(rows):
            columns = r.xpath(X["columns"])
            if len(columns) == 8:
                p = parse_columns(columns)
                list_to_file(p, f)

        sys.stdout.write("%d\t" % page)
        sys.stdout.flush()

    directory = DIR["meta"]
    utils.check_dir(directory)
    meta_data = "%s/%d.csv" % (directory, assembly_id)

    print "\nParsing:"
    with open(meta_data, "wa") as f:
        list_to_file(META_HEADERS, f)
        for page in range(1, npages + 1):
            parse_page(page, f, assembly_id)

    print "\nMeta data written to " + meta_data
Example #14
File: main.py Project: JH27/crawlers
def main(args):
    printer = print_csv if args.test else print_json
    filetype = 'csv' if args.test else 'json'
    datadir = args.directory if args.directory else '.'
    check_dir(datadir)

    if args.target=='local':
        if args.end:
            jobs = []
            args.level = get_election_type_name(args.level)
            for n in xrange(args.start, args.end+1):
                filename = '%s/%s-%s-%s-%d.%s'\
                    % (datadir, args.target, args.level, args.type, n, filetype)
                job = gevent.spawn(crawl, target=args.target, level=args.level,\
                    _type=args.type, nth=n, filename=filename, printer=printer)
                jobs.append(job)
            gevent.joinall(jobs)
        else:
            n = args.start
            args.level = get_election_type_name(args.level)
            filename = '%s/%s-%s-%s-%.01f.%s' %\
                    (datadir, args.target, args.level, args.type, n, filetype)
            crawl(target=args.target, level=args.level, _type=args.type, nth=n,\
                        filename=filename, printer=printer)
    else:
        if args.end:
            jobs = []
            for n in xrange(args.start, args.end+1):
                filename = '%s/%s-%s-%d.%s'\
                        % (datadir, args.target, args.type, n, filetype)
                job = gevent.spawn(crawl, target=args.target, _type=args.type, nth=n,\
                        filename=filename, printer=printer)
                jobs.append(job)
            gevent.joinall(jobs)
        else:
            n = args.start
            filename = '%s/%s-%s-%.01f.%s' %\
                    (datadir, args.target, args.type, n, filetype)
            crawl(target=args.target, _type=args.type, nth=n,\
                        filename=filename, printer=printer)
    print 'Data written to %s' % filename
Example #15
File: pdf.py Project: dongx3/crawlers
def get_pdf(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return

    indir = '%s/%s' % (DIR['data'], assembly_id)
    outdir = '%s/%s' % (DIR['pdf'], assembly_id)
    utils.check_dir(outdir)

    failed = []
    jsons = os.listdir(indir)[range[0]:range[1]]

    for json in jsons:
        if bill_ids and json.split('.', 1)[0] not in bill_ids:
            continue

        try:
            download(assembly_id, json, indir, outdir)
        except (IndexError, TypeError) as e:
            print 'Failed downloading %s with %s' % (json, e)
            failed.append((json, e))
    print 'Failed files: ', failed
Example #16
File: get_html.py Project: JH27/crawlers
def get_html(assembly_id, range=(None, None), bill_ids=None):
    if bill_ids is not None and not bill_ids:
        return

    for field in HTML_FIELDS:
        utils.check_dir('%s/%s' % (DIR[field], assembly_id))

    metadata = get_metadata(assembly_id, range=range)

    for bill_id in metadata:
        if bill_id == 'bill_id':
            continue

        if bill_ids and bill_id not in bill_ids:
            continue

        link_id, has_summaries = metadata[bill_id]
        for field in HTML_FIELDS[1:3]:
            get_page(assembly_id, bill_id, link_id, field)
        get_specifics(assembly_id, bill_id, link_id)
        get_summaries(assembly_id, bill_id, link_id, has_summaries)

        sys.stdout.write('%s\t' % bill_id)
        sys.stdout.flush()
Example #17
    def _set_project_files(self):
        """Sets the project path, log, jobs, config, and results paths for the project.

           project_path - parent directory for this project
           config_path - parent directory for all config files within this project
           job_path - parent directory for all job files within this project
           log_path - parent directory for all log files within this project
           results_path - parent directory for all results files within this project
           report_path - parent directory for all report files within this project
        """
        self.files = {
            "project_path": os.path.abspath(self.args.output_directory),
            "config_path": os.path.join(os.path.abspath(self.args.output_directory), "config"),
            "log_path": os.path.join(os.path.abspath(self.args.output_directory), "logs"),
            "results_path": os.path.join(os.path.abspath(self.args.output_directory), "results"),
            "report_path": os.path.join(os.path.abspath(self.args.output_directory), "report"),
        }
        [check_dir(i) for i in self.files.values()]
Example #18
import os
from utils import check_dir, check_file

# config
IP = '0.0.0.0'
PORT = 6677
DGRAM_FORMAT = '50s50s50s200s'
CMD_FORMAT = '50s50s50s'
BASE_DIR = './demo'
DEBUG = True
PID_FILE = os.path.join(BASE_DIR, 'pychat_server.pid')
LOG_FILE = os.path.join(BASE_DIR, 'pychat_server.log')

# const
USER_PATH = os.path.join(BASE_DIR, 'user')
MSG_PATH = os.path.join(BASE_DIR, 'msg')
FILE_PATH = os.path.join(BASE_DIR, 'file')

history_msg_file = os.path.join(MSG_PATH, 'history.pk')
offline_msg_file = os.path.join(MSG_PATH, 'offline.pk')

user_file = os.path.join(USER_PATH, 'user.pk')
friend_file = os.path.join(USER_PATH, 'friends.pk')

file_info = os.path.join(FILE_PATH, 'file_info.txt')

check_dir(USER_PATH)
check_dir(MSG_PATH)
check_dir(FILE_PATH)
check_file(file_info)
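Besides check_dir, this config module relies on a check_file helper that is also never shown on this page. A minimal sketch, assuming it simply creates an empty file when one is missing, is given below; the actual helper in onestraw/pychat may behave differently.

import os

def check_file(path):
    # Assumed behaviour: create an empty file at `path` if it does not exist yet.
    if not os.path.isfile(path):
        open(path, 'a').close()
    return path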
Example #19
    def __set_reporter_obj(self, pobj, settings):
        '''Builds the basics of the reporter_obj'''
        self.reporter_obj = {
          'module': 'PipelineReport',
          'software': {
            'python': settings.system['python']
          },
          'parameters': {'project': pobj.name, 'sample': self.name},
          'options': settings.options['annovar'],
          'reads': {
            'inputs': {'normal': [], 'tumor': []},
            'outputs': {'normal': [], 'tumor': []} 
          },
          'alignments': {'inputs': {}, 'outputs': {}},
          'somatic': {'inputs': [], 'outputs': {}}
        }
       
        # INPUTS
        for a in ['bwa_aln', 'bwa_mem', 'novoalign']:
            if a in settings.aln_list:
                # ALIGNMENTS
                self.reporter_obj['alignments']['inputs'][a] = {
                  'normal': {
                    'alignment_summary_metrics': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.metrics.alignment_summary_metrics'.format(
                      self, a)),
                    'insert_size_metrics': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.metrics.insert_size_metrics'.format(
                      self, a)),
                    'total_coverage': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-NORMAL.{1}.{2}.exons.bed'.format(
                      self, a, pobj.assembly['refname']))
                  },
                  'tumor': {
                    'alignment_summary_metrics': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.metrics.alignment_summary_metrics'.format(
                      self, a)),
                    'insert_size_metrics': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.metrics.insert_size_metrics'.format(
                      self, a)),
                    'total_coverage': os.path.join(pobj.files['results_path'], \
                      '{0.name}/01_Alignments/{1}/{0.name}-TUMOR.{1}.{2}.exons.bed'.format(
                      self, a, pobj.assembly['refname']))
                  },
                }
               
                # SMD
                for s in ['mutect', 'shimmer', 'sniper', 'strelka', 'varscan', 'virmid']:
                    if s in settings.smd_list: 
                        self.reporter_obj['somatic']['inputs'].append({
                          'aln': a, 'smd': s,
                          'annovar': os.path.join(pobj.files['results_path'], \
                            '{0.name}/04_VariantAnnotation/'.format(self) + \
                            '{0.name}.{1}.{2}.vcf.annovar.anno.{3}_multianno.txt'.format(
                            self, s, a, pobj.assembly['refname'])),
                          'vcf': os.path.join(pobj.files['results_path'], \
                            '{0.name}/03_SomaticMutations/'.format(self) + \
                            '{0.name}.{1}.{2}.final.vcf'.format(self, s, a))
                        })

        # OUTPUTS
        self.reporter_obj['alignments']['outputs'] = {
          'alignment_summary_metrics': os.path.join(pobj.files['report_path'], \
            'project_alignments/{0.name}.alignment_summary_metrics.txt'.format(self)),
          'insert_size_metrics': os.path.join(pobj.files['report_path'], \
            'project_alignments/{0.name}.insert_size_metrics.txt'.format(self)),
          'total_coverage': os.path.join(pobj.files['report_path'], \
            'project_alignments/{0.name}.total_coverage.txt'.format(self)),
          'path': os.path.join(pobj.files['report_path'], 'project_alignments')
        }
        self.reporter_obj['somatic']['outputs'] = {
          'smd_snp_table': os.path.join(pobj.files['report_path'], \
            'project_somatic/{0.name}.somatic.snps.tsv'.format(self))
        }

        # Append to reporter_files
        anames = ['alignment_summary_metrics', 'insert_size_metrics', 'total_coverage']
        [self.reporter_files.append(self.reporter_obj['alignments']['outputs'][i]) for i in anames]
        self.reporter_files.append(self.reporter_obj['somatic']['outputs']['smd_snp_table'])

        # Check output paths
        utils.check_dir(self.reporter_obj['alignments']['outputs']['path']) 
        utils.check_dir(os.path.abspath(os.path.dirname(self.reporter_obj['somatic']['outputs']['smd_snp_table']))) 
Example #20
def parse_args():
    desc = 'TensorFlow 2.0 implementation of Residual Attribute Generative Adversarial Network (RAG)'
    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('--dataset_name', type=str, default='celeba')
    parser.add_argument('--phase',
                        type=str,
                        default='tfrecord',
                        choices=('tfrecord', 'train', 'test'))
    parser.add_argument('--img_size', type=int, default=128)
    parser.add_argument('--img_nc', type=int, default=3)

    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--decay_epochs', type=int, default=10)
    parser.add_argument('--w_adv', type=float, default=1)
    parser.add_argument('--w_cls', type=float, default=10)
    parser.add_argument('--w_cyc', type=float, default=10)
    parser.add_argument('--w_rec', type=float, default=10)
    parser.add_argument('--w_a', type=float, default=1)
    parser.add_argument('--w_tv', type=float, default=2.5)
    parser.add_argument('--gan_type',
                        type=str,
                        default='lsgan',
                        choices=('vanilla', 'lsgan', 'hinge'))

    parser.add_argument('--log_freq', type=int, default=1000)
    parser.add_argument('--output_dir', type=str, default='output')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--sample_dir', type=str, default='sample')
    parser.add_argument('--save_dir', type=str, default='model')
    parser.add_argument('--result_dir', type=str, default='result')
    parser.add_argument('--test_img', type=str, default='000009.jpg')

    args = parser.parse_args()
    check_dir(args.output_dir)
    args.output_dir = os.path.join(args.output_dir, f'RAG_{args.dataset_name}')
    check_dir(args.output_dir)
    args.log_dir = os.path.join(args.output_dir, args.log_dir)
    check_dir(args.log_dir)
    args.sample_dir = os.path.join(args.output_dir, args.sample_dir)
    check_dir(args.sample_dir)
    args.save_dir = os.path.join(args.output_dir, args.save_dir)
    check_dir(args.save_dir)
    args.result_dir = os.path.join(args.output_dir, args.result_dir)
    check_dir(args.result_dir)

    if args.dataset_name == 'celeba':
        args.shorter_size = 178
        args.attrs = [
            'Black_Hair', 'Blond_Hair', 'Brown_Hair', 'Male', 'Young',
            'Eyeglasses', 'Mouth_Slightly_Open', 'Pale_Skin', 'Rosy_Cheeks',
            'Smiling', 'Heavy_Makeup'
        ]
        args.label_nc = len(args.attrs)

    return args
Example #21
File: train.py Project: yf1291/nlp3
    )

    dloader_val = data_loader(
        dataset=dataset_val,
        nKnovel=opt.test_way,
        nKbase=0,
        nExemplars=opt.val_shot, # num training examples per novel category
        nTestNovel=opt.val_query * opt.test_way, # num test examples for all the novel categories
        nTestBase=0, # num test examples for all the base categories
        batch_size=1,
        num_workers=0,
        epoch_size=1 * opt.val_episode, # num of batches per epoch
    )

    set_gpu(opt.gpu)
    check_dir('./experiments/')
    check_dir(opt.save_path)
    
    log_file_path = os.path.join(opt.save_path, "train_log.txt")
    log(log_file_path, str(vars(opt)))

    (embedding_net, cls_head) = get_model(opt)
    
    optimizer = torch.optim.SGD([{'params': embedding_net.parameters()}, 
                                 {'params': cls_head.parameters()}], lr=0.1, momentum=0.9, \
                                          weight_decay=5e-4, nesterov=True)
    
    lambda_epoch = lambda e: 1.0 if e < 20 else (0.06 if e < 40 else 0.012 if e < 50 else (0.0024))
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_epoch, last_epoch=-1)

    max_val_acc = 0.0
Example #22
File: const.py Project: onestraw/pychat
import os
from utils import check_dir, check_file

IP = '127.0.0.1'
PORT = 6677
DGRAM_FORMAT = '50s50s50s200s'
CMD_FORMAT = '50s50s50s'
BASE_DIR = './demo'
DEBUG = True
PID_FILE = os.path.join(BASE_DIR, 'pychat_server.pid')
LOG_FILE = os.path.join(BASE_DIR, 'pychat_server.log')

# const
USER_PATH = os.path.join(BASE_DIR, 'user')
MSG_PATH = os.path.join(BASE_DIR, 'msg')
FILE_PATH = os.path.join(BASE_DIR, 'file')


history_msg_file = os.path.join(MSG_PATH, 'history.pk')
offline_msg_file = os.path.join(MSG_PATH, 'offline.pk')


user_file = os.path.join(USER_PATH, 'user.pk')
friend_file = os.path.join(USER_PATH, 'friends.pk')


file_info = os.path.join(FILE_PATH, 'file_info.txt')


check_dir(USER_PATH)
check_dir(MSG_PATH)
check_dir(FILE_PATH)
check_file(file_info)
Example #23
def train(model, train_loader, test_loader, dev_loader, optimizer, conf,
          logger):
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=conf['num_epochs'] * len(train_loader), eta_min=1e-6)

    model.train()
    best_random_error = 100.0
    iter_per_epoch = len(train_loader)

    if conf['rank'] == 0:
        summary_dir = os.path.join(conf['exp_dir'], 'tensorX_log')
        check_dir(summary_dir)
        tb_writer = SummaryWriter(summary_dir)

    for epoch in range(conf['num_epochs']):
        acc_sum = 0.  # Accuracy
        epoch_loss = 0.0

        model.train()

        if conf['rank'] == 0:
            t_bar = tqdm(ncols=100,
                         total=iter_per_epoch,
                         desc='Epoch:{}'.format(epoch))
        for iter_idx, (images, labels, loss_weight) in enumerate(train_loader):
            if conf['rank'] == 0:
                t_bar.update()

            images = images.to(conf['device'])
            labels = labels.to(conf['device'])
            loss_weight = loss_weight.to(conf['device'])

            optimizer.zero_grad()
            seg_res = model(images)
            seg_prob = torch.sigmoid(seg_res)

            seg_res_flat = seg_res.view(seg_res.size(0), -1)
            labels_flat = labels.view(labels.size(0), -1)
            loss_weight_flat = loss_weight.view(loss_weight.size(0), -1)

            loss = F.binary_cross_entropy_with_logits(seg_res_flat,
                                                      labels_flat,
                                                      reduction='none')
            loss = torch.mean(loss * loss_weight_flat)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

            acc = get_accuracy(seg_prob, labels)
            acc_sum += acc

            step_idx = epoch * iter_per_epoch + iter_idx
            if conf['rank'] == 0:
                tb_writer.add_scalar("acc_step", acc, step_idx)
                tb_writer.add_scalar("loss_step", loss.item(), step_idx)

        if conf['rank'] == 0:
            t_bar.close()

        acc_sum = acc_sum / iter_per_epoch

        epoch_loss /= iter_per_epoch
        current_lr = optimizer.param_groups[0]['lr']

        # logger.info("[Train] Rank: {} Epoch: [{}/{}] Acc: {:.3f} Loss: {:.3f} Lr:{:.3e}".format(conf['rank'],
        #                                                 epoch, conf['num_epochs'],
        #                                                 acc, epoch_loss, current_lr))

        test_acc, test_error, test_pre, test_recall, test_split, test_merge = test(
            model, test_loader, conf, logger, epoch, best_random_error)
        dev_acc, dev_error, dev_pre, dev_recall, dev_split, dev_merge = dev_eval(
            model, dev_loader, conf)

        logger.info(
            "[Train] Rank: {} Epoch: [{}/{}] Acc: {:.3f} Loss: {:.3f} Lr:{:.3e} "
            "R_error: {:.3f} R_pre: {:.3f} R_recall: {:.3f}"
            " F_split: {:.2f} F_merge: {:.2f}".format(conf['rank'], epoch,
                                                      conf['num_epochs'],
                                                      acc_sum, epoch_loss,
                                                      current_lr, dev_error,
                                                      dev_pre, dev_recall,
                                                      dev_split, dev_merge))
        if conf['rank'] == 0:
            tb_writer.add_scalar("test_acc", test_acc, epoch)
            tb_writer.add_scalar("test_error", test_error, epoch)
            tb_writer.add_scalar("test_pre", test_pre, epoch)
            tb_writer.add_scalar("test_recall", test_recall, epoch)
            tb_writer.add_scalar("test_split", test_split, epoch)
            tb_writer.add_scalar("test_merge", test_merge, epoch)

            tb_writer.add_scalar("train_acc", dev_acc, epoch)
            tb_writer.add_scalar("train_error", dev_error, epoch)
            tb_writer.add_scalar("train_pre", dev_pre, epoch)
            tb_writer.add_scalar("train_recall", dev_recall, epoch)
            tb_writer.add_scalar("train_split", dev_split, epoch)
            tb_writer.add_scalar("train_merge", dev_merge, epoch)

        if best_random_error > test_error and conf['rank'] == 0:
            best_random_error = test_error
            save_name = 'Best'
            state_dict = {'model': model.module.state_dict()}
            save_checkpoint(state_dict,
                            conf['checkpoint_format'].format(save_name))

        if epoch % conf['save_per_epoch'] == 0 and conf['rank'] == 0:
            save_name = 'Epoch-{}'.format(epoch)
            state_dict = {'model': model.module.state_dict()}
            save_checkpoint(state_dict,
                            conf['checkpoint_format'].format(save_name))

    if conf['rank'] == 0:
        tb_writer.close()
Example #24
def main():
    from version import __version__
    import circ
    import pipeline
    from logger import get_logger
    from utils import check_file, check_dir, check_config, get_thread_num
    from utils import CIRCparser, TOOLS

    # Init argparser
    parser = argparse.ArgumentParser(prog='CIRIquant')

    # required arguments
    parser.add_argument(
        '--config',
        dest='config_file',
        metavar='FILE',
        help='Config file in YAML format',
    )
    parser.add_argument(
        '-1',
        '--read1',
        dest='mate1',
        metavar='MATE1',
        help='Input mate1 reads (for paired-end data)',
    )
    parser.add_argument(
        '-2',
        '--read2',
        dest='mate2',
        metavar='MATE2',
        help='Input mate2 reads (for paired-end data)',
    )

    # optional arguments
    parser.add_argument(
        '-o',
        '--out',
        dest='output',
        metavar='DIR',
        default=None,
        help='Output directory, default: ./',
    )
    parser.add_argument(
        '-p',
        '--prefix',
        dest='prefix',
        metavar='PREFIX',
        default=None,
        help='Output sample prefix, default: input sample name',
    )
    parser.add_argument(
        '-t',
        '--threads',
        dest='cpu_threads',
        default=4,
        metavar='INT',
        help='Number of CPU threads, default: 4',
    )
    parser.add_argument(
        '-a',
        '--anchor',
        dest='anchor',
        default=5,
        metavar='INT',
        help='Minimum anchor length for junction alignment, default: 5',
    )
    parser.add_argument(
        '-l',
        '--libary-type',
        dest='library_type',
        metavar='INT',
        default=0,
        help='Library type, 0: unstranded, 1: read1 match the sense strand,'
        '2: read1 match the antisense strand, default: 0',
    )

    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbosity',
        default=False,
        action='store_true',
        help='Run in debugging mode',
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))
    parser.add_argument(
        '-e',
        '--log',
        dest='log_file',
        default=None,
        metavar='LOG',
        help='Log file, default: out_dir/prefix.log',
    )

    # provide pre-defined list of circRNAs
    parser.add_argument(
        '--bed',
        dest='bed',
        metavar='FILE',
        default=None,
        help='bed file for putative circRNAs (optional)',
    )
    parser.add_argument(
        '--circ',
        dest='circ',
        metavar='FILE',
        default=None,
        help='circRNA prediction results from other softwares',
    )
    parser.add_argument(
        '--tool',
        dest='tool',
        metavar='TOOL',
        default=None,
        help='circRNA prediction tool, required if --circ is provided',
    )

    # when provide RNase R result, do RNase R correction
    parser.add_argument(
        '--RNaseR',
        dest='rnaser',
        metavar='FILE',
        default=None,
        help='CIRIquant result of RNase R sample',
    )

    # skip hisat2 alignment for RNA-seq data
    parser.add_argument(
        '--bam',
        dest='bam',
        metavar='BAM',
        default=None,
        help='hisat2 alignment to reference genome',
    )

    # skip stringtie prediction
    parser.add_argument(
        '--no-gene',
        dest='gene_exp',
        default=False,
        action='store_true',
        help='Skip stringtie estimation for gene abundance',
    )

    args = parser.parse_args()
    """Check required parameters"""
    # check input reads
    if args.mate1 and args.mate2:
        reads = [check_file(args.mate1), check_file(args.mate2)]
    else:
        sys.exit(
            'No input files specified, please see manual for detailed information'
        )

    try:
        lib_type = int(args.library_type)
    except ValueError:
        sys.exit(
            'Wrong library type, please check your command.\nSupported types:\n0 - unstranded;\n'
            '1 - read1 match the sense strand;\n2 - read1 match the antisense strand;'
        )

    if lib_type not in [0, 1, 2]:
        sys.exit(
            'Wrong library type, please check your command.\nSupported types:\n0 - unstranded;\n'
            '1 - read1 match the sense strand;\n2 - read1 match the antisense strand;'
        )

    # check configuration
    if args.config_file:
        config = check_config(check_file(args.config_file))
    else:
        sys.exit(
            'A config file is needed, please see manual for detailed information.'
        )
    """Check optional parameters"""
    # use circRNA bed file if provided
    bed_file = check_file(args.bed) if args.bed else None
    circ_file = check_file(args.circ) if args.circ else None
    circ_tool = args.tool

    # user provided RNase R CIRIquant results
    rnaser_file = check_file(args.rnaser) if args.rnaser else None

    # pre aligned hisat2 bam
    hisat_bam = check_file(args.bam) if args.bam else None

    # Output prefix
    if args.prefix is None:
        try:
            prefix = re.search(r'(\S+)[_/-][12]',
                               os.path.basename(reads[0])).group(1)
        except AttributeError:
            sys.exit(
                'Ambiguous sample name, please manually select output prefix')
    else:
        prefix = args.prefix

    # check output dir
    outdir = './' + prefix if args.output is None else args.output
    outdir = check_dir(outdir)

    # Parse arguments
    log_file = os.path.abspath(
        args.log_file) if args.log_file else '{}/{}.log'.format(
            outdir, prefix)
    verbosity = args.verbosity
    logger = get_logger('CIRIquant', log_file, verbosity)

    # Add lib to PATH
    lib_path = os.path.dirname(os.path.split(
        os.path.realpath(__file__))[0]) + '/libs'
    os.environ['PATH'] = lib_path + ':' + os.environ['PATH']
    os.chmod(lib_path + '/CIRI2.pl', 0o755)
    """Start Running"""
    os.chdir(outdir)
    logger.info(
        'Input reads: ' +
        ','.join([os.path.basename(args.mate1),
                  os.path.basename(args.mate2)]))

    if lib_type == 0:
        lib_name = 'unstranded'
    elif lib_type == 1:
        lib_name = 'ScriptSeq'
    elif lib_type == 2:
        lib_name = 'TAKARA SMARTer'
    else:
        sys.exit(
            'Unsupported library type, please check the manual for instructions.'
        )

    logger.info('Library type: {}'.format(lib_name))
    logger.info('Output directory: {}, Output prefix: {}'.format(
        outdir, prefix))
    logger.info('Config: {} Loaded'.format(config))

    thread = get_thread_num(int(args.cpu_threads))
    anchor = int(args.anchor)

    # Step1: Data Preparation
    # Step1.1: HISAT2 mapping
    if hisat_bam is None:
        logger.info('Align RNA-seq reads to reference genome ..')
        hisat_bam = pipeline.align_genome(log_file, thread, reads, outdir,
                                          prefix)
    else:
        logger.info(
            'HISAT2 alignment bam provided, skipping alignment step ..')
    logger.debug('HISAT2 bam: {}'.format(os.path.basename(hisat_bam)))

    # Step1.2: Estimate Gene Abundance
    if args.gene_exp:
        logger.info('Skipping gene abundance estimation')
    else:
        pipeline.gene_abundance(log_file, thread, outdir, prefix, hisat_bam)

    # Step3: run CIRI2
    if bed_file:
        logger.info(
            'Using user-provided circRNA bed file: {}'.format(bed_file))
    else:
        if circ_file or circ_tool:
            if circ_file and circ_tool:
                logger.info(
                    'Using predicted circRNA results from {}: {}'.format(
                        circ_tool, circ_file))
                circ_parser = CIRCparser(circ_file, circ_tool)
            else:
                sys.exit(
                    '--circ and --tool must be provided at the same time!')
        else:
            logger.info(
                'No circRNA information provided, run CIRI2 for junction site prediction ..'
            )
            bwa_sam = pipeline.run_bwa(log_file, thread, reads, outdir, prefix)
            ciri_file = pipeline.run_ciri(log_file, thread, bwa_sam, outdir,
                                          prefix)
            circ_parser = CIRCparser(ciri_file, 'CIRI2')

        bed_file = '{}/{}.bed'.format(outdir, prefix)
        circ_parser.convert(bed_file)

    # Step4: estimate circRNA expression level
    out_file = circ.proc(log_file, thread, bed_file, hisat_bam, rnaser_file,
                         reads, outdir, prefix, anchor, lib_type)

    # Remove temporary files
    pipeline.clean_tmp(outdir, prefix)

    logger.info('circRNA Expression profile: {}'.format(
        os.path.basename(out_file)))

    logger.info('Finished!')
Example #25
        negative_words = pd.read_csv("../asset/negative-words.txt",
                                     header=None,
                                     encoding='latin-1')

        positive_words_list = convert_words_list(positive_words)

        #remove word trump from positive word list
        positive_words_list = [
            i for i in positive_words_list if i not in "trump"
        ]
        negative_words_list = convert_words_list(negative_words)
        df = scoring_tweets(df, "text", positive_words_list,
                            negative_words_list)
        print("Tagging Finished !")
        # save
        check_dir(opt.output_path)
        df.to_csv(os.path.join(opt.output_path, "{}.csv".format("tagging")),
                  index=False)
        #------------------------
        # exp
        #------------------------
        df = pd.read_csv(
            os.path.join(opt.output_path, "{}.csv".format("tagging")))
        df.dropna(subset=["text"], inplace=True)
        print("num_data : ", len(df))
        df_train = df.sample(frac=0.8)
        print("num_training_data : ", len(df_train))
        df_test = df[~df.index.isin(df_train.index)]
        print("num_testing_data : ", len(df_test))
        assert len(df_train) + len(df_test) == len(
            df), "it should be the same."
import os

import numpy as np

import viz
from dataset import load_dataset
from utils import check_dir

dataset_id = "omniscient"
datasets_dir = "datasets/"
runs_dir = os.path.join(datasets_dir, dataset_id)
videos_dir = os.path.join(runs_dir, 'videos')
check_dir(videos_dir)

# Load the dataset
dataset = load_dataset(runs_dir)
dataset.load()

# Select the last step of each run
last_steps = dataset.groupby("run").map(lambda x: x.isel(sample=-1))
runs = last_steps.run

# Only choose runs that were interrupted before reaching the goal.
# runs = runs.where(last_steps.goal_reached == False)

n_runs = 10

for _ in range(n_runs):
    run_id = np.random.choice(runs)
    run = dataset.where(dataset.run == run_id, drop=True)
Example #27
    def __init__(self,
                 batch_size,
                 en_optimizer,
                 de_optimizer,
                 en_learning_rate,
                 de_learning_rate,
                 attn_method,
                 train_data_engine,
                 test_data_engine,
                 use_embedding,
                 en_use_attr_init_state,
                 en_hidden_size=100,
                 de_hidden_size=100,
                 en_vocab_size=None,
                 de_vocab_size=None,
                 vocab_size=None,
                 en_embedding_dim=None,
                 de_embedding_dim=None,
                 embedding_dim=None,
                 embeddings=None,
                 en_embedding=True,
                 share_embedding=True,
                 n_decoders=2,
                 cell="GRU",
                 n_en_layers=1,
                 n_de_layers=1,
                 bidirectional=False,
                 feed_last=False,
                 repeat_input=False,
                 batch_norm=False,
                 model_dir="./model",
                 log_dir="./log",
                 is_load=True,
                 check_mem_usage_batches=0,
                 replace_model=True,
                 finetune_embedding=False,
                 model_config=None):

        # Initialize attributes
        self.data_engine = train_data_engine
        self.check_mem_usage_batches = check_mem_usage_batches
        self.n_decoders = n_decoders
        self.log_dir = log_dir
        self.model_dir = model_dir
        self.en_embedding_dim = en_embedding_dim
        self.de_embedding_dim = de_embedding_dim
        self.embedding_dim = embedding_dim
        self.repeat_input = repeat_input

        # Initialize embeddings, encoders and decoders
        """
        There are some available options here, most of which matter when using
        E2E dataset.
        (You still can use them while using dialogue generation dataset
        like CMDC, but it's NOT RECOMMENDED.)

        1) en_embedding (default True):
            If the option is on, we're going to add embedding layer into
            encoder; otherwise, the one-hot vectors are directly fed into
            encoder's RNN.
            For now, the decoder always has an embedding layer; this is
            because that we assumed that the decoder should always output the
            natural language, and it's reasonable that using an embedding layer
            instead of directly pass one-hot vectors into RNN.
        2) share_embedding (default True):
            If the option is on, first you should make sure that the input of
            encoder and decoder are in same vector space,
            (e.g. both natural language); otherwise, it will cause some strange
            result, (it is possible that you can train the model without any
            error, but the shared embedding layer doesn't make sense, as you
            should know.)
            When the option is on, the embedding dimension will be the argument
            embedding_dim, and the vocabulary size will be vocab_size; the
            argument en_embedding_dim, de_embedding_dim, en_vocab_size and
            de_vocab_size won't be used.
        3) use_embedding (default True):
            When the option is on:
            (1) If share_embedding option is on, the shared embedding will be
            initialized with the embeddings we pass into the model.
            (2) If en_embedding is on while share_embedding option being off,
            only the embedding in decoder will be initialized with the
            pre-trained embeddings, and the encoder embeddings will be trained
            from scratch (this combination of options is NOT APPROPRIATE when
            using dialogue generation dataset, as you should know, it's kind
            of strange that we only initialize the embedding in decoder when
            both input and output of the encoder and decoder are in same vector
            space.)

        As mentioned above, since that the options are not disjoint, I'll list
        some possible combination below, which are reasonable to be tested and
        compared:

        1) en_embedding=True, share_embedding=True, \
                use_embedding=True (dialogue generation)
        2) en_embedding=True, share_embedding=True, \
                use_embedding=False (dialogue generation)
        3) en_embedding=True, share_embedding=False, \
                use_embedding=True (semantic form to NL)
        4) en_embedding=False, share_embedding=X(don't care), \
                use_embedding=True (semantic form to NL)
        5) en_embedding=True, share_embedding=False, \
                use_embedding=False (semantic form to NL)
        6) en_embedding=False, share_embedding=X(don't care), \
                use_embedding=False (semantic form to NL)

        """
        # embedding layer setting
        if not en_embedding:
            en_embed = None
            de_embed = nn.Embedding(de_vocab_size, de_embedding_dim)
            if use_embedding:
                de_embed.weight = embeddings
                if not finetune_embedding:
                    de_embed.weight.requires_grad = False
        else:
            if share_embedding:
                embed = nn.Embedding(vocab_size, embedding_dim)
                if use_embedding:
                    embed.weight = embeddings
                    if not finetune_embedding:
                        embed.weight.requires_grad = False
                en_embed = embed
                de_embed = embed
            else:
                en_embed = nn.Embedding(en_vocab_size, en_embedding_dim)
                de_embed = nn.Embedding(de_vocab_size, de_embedding_dim)
                if use_embedding:
                    # in E2ENLG dataset, only decoder use word embedding
                    de_embed.weight = embeddings
                    if not finetune_embedding:
                        de_embed.weight.requires_grad = False

        self.encoder = EncoderRNN(
            en_embedding=en_embedding,
            embedding=en_embed,
            en_vocab_size=en_vocab_size,
            en_embedding_dim=(embedding_dim if share_embedding and en_embedding
                              else en_embedding_dim),
            hidden_size=en_hidden_size,
            n_layers=n_en_layers,
            bidirectional=bidirectional,
            cell=cell)

        self.cell = cell
        self.decoders = []
        for n in range(n_decoders):
            decoder = DecoderRNN(
                embedding=de_embed,
                de_vocab_size=de_vocab_size,
                de_embedding_dim=(embedding_dim if share_embedding
                                  and en_embedding else self.de_embedding_dim),
                en_hidden_size=en_hidden_size,
                de_hidden_size=de_hidden_size,
                n_en_layers=n_en_layers,
                n_de_layers=n_de_layers,
                bidirectional=bidirectional,
                feed_last=(True if feed_last and n > 0 else False),
                batch_norm=batch_norm,
                attn_method=attn_method,
                cell=cell)
            self.decoders.append(decoder)

        self.encoder = self.encoder.cuda() if use_cuda else self.encoder
        self.decoders = [
            decoder.cuda() if use_cuda else decoder
            for decoder in self.decoders
        ]

        # Initialize data loaders and optimizers
        self.train_data_loader = DataLoader(train_data_engine,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            num_workers=1,
                                            drop_last=True,
                                            collate_fn=collate_fn,
                                            pin_memory=True)

        self.test_data_loader = DataLoader(test_data_engine,
                                           batch_size=batch_size,
                                           shuffle=False,
                                           num_workers=1,
                                           drop_last=True,
                                           collate_fn=collate_fn,
                                           pin_memory=True)

        # encoder parameters optimization
        self.encoder_parameters = filter(lambda p: p.requires_grad,
                                         self.encoder.parameters())
        self.encoder_optimizer = build_optimizer(en_optimizer,
                                                 self.encoder_parameters,
                                                 en_learning_rate)
        # decoder parameters optimization
        decoder_parameters = []
        for decoder in self.decoders:
            decoder_parameters.extend(list(decoder.parameters()))
        self.decoder_parameters = filter(lambda p: p.requires_grad,
                                         decoder_parameters)
        self.decoder_optimizer = build_optimizer(de_optimizer,
                                                 self.decoder_parameters,
                                                 de_learning_rate)

        print_time_info("Model create complete")
        # check directory and model existence
        Y, M, D, h, m, s = get_time()
        if not replace_model:
            self.model_dir = os.path.join(
                self.model_dir,
                "{}{:0>2}{:0>2}_{:0>2}{:0>2}{:0>2}".format(Y, M, D, h, m, s))

        if not os.path.isdir(self.model_dir):
            os.makedirs(self.model_dir)
        else:
            if not is_load:
                check_dir(self.model_dir)

        self.log_dir = os.path.join(
            self.log_dir,
            "{}{:0>2}{:0>2}_{:0>2}{:0>2}{:0>2}".format(Y, M, D, h, m, s))

        if not os.path.isdir(self.log_dir):
            os.makedirs(self.log_dir)
            os.makedirs(os.path.join(self.log_dir, "validation"))

        with open(os.path.join(self.log_dir, "model_config"), "w+") as f:
            for arg in vars(model_config):
                f.write("{}: {}\n".format(arg, str(getattr(model_config,
                                                           arg))))
            f.close()

        if is_load:
            self.load_model(model_dir)

        # Initialize the log files
        self.logger = Logger(self.log_dir)
        self.train_log_path = os.path.join(self.log_dir, "train_log.csv")
        self.valid_batch_log_path = os.path.join(self.log_dir,
                                                 "valid_batch_log.csv")
        self.valid_epoch_log_path = os.path.join(self.log_dir,
                                                 "valid_epoch_log.csv")

        with open(self.train_log_path, 'w') as file:
            file.write("epoch, batch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n")
        with open(self.valid_batch_log_path, 'w') as file:
            file.write("epoch, batch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n")
        with open(self.valid_epoch_log_path, 'w') as file:
            file.write("epoch, loss, avg-bleu, avg-rouge(1,2,L,BE)\n")

        # Initialize batch count
        self.batches = 0

        self.en_use_attr_init_state = en_use_attr_init_state
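
The trainer above builds both optimizers through a build_optimizer(name, params, lr) helper that is not shown on this page. As a rough sketch only, assuming the helper just maps an optimizer name onto the matching torch.optim constructor, it might look like this:

# Hypothetical sketch of build_optimizer; the project's real helper is not shown here.
import torch.optim as optim

def build_optimizer(name, parameters, learning_rate):
    # Map an optimizer name string onto the corresponding torch.optim class.
    optimizers = {'adam': optim.Adam, 'sgd': optim.SGD, 'rmsprop': optim.RMSprop}
    return optimizers[name.lower()](parameters, lr=learning_rate)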
示例#28
0
import os
import re
import numpy as np
from numpy import random
from utils import check_dir
from scipy.io import loadmat
from scipy.io import savemat

# Combine the frequency-domain and LSTM features, then save them
LSTM_feature_path = "../LSTM_feature"
frequency_feature_path = "../frequency_feature/out"
stack_feature_path = "../stack_feature"
check_dir(stack_feature_path)
sample_list = os.listdir(LSTM_feature_path)

for sample in sample_list:
    loaded_sample_LSTM = loadmat(os.path.join(LSTM_feature_path, sample))
    LSTM_feature = loaded_sample_LSTM['temp']
    loaded_sample_frequency = loadmat(
        os.path.join(frequency_feature_path,
                     sample[0:sample.index('.')] + '_frequency.mat'))
    frequency_feature = loaded_sample_frequency['data']
    stack_feature = np.hstack((LSTM_feature, frequency_feature))
    savemat(os.path.join(stack_feature_path, sample), {'temp': stack_feature})

path = '../stack_feature'
output_path = '../txt/stack_feature'
check_dir(path)
check_dir(output_path)

data = []
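
Every snippet on this page imports check_dir from a utils module that is not shown in these examples. In most of them it is used as a "create the directory if it does not exist yet" helper, so a minimal stand-in under that assumption could be:

# Hypothetical stand-in for utils.check_dir as used by most examples here.
import os

def check_dir(directory):
    # Create the directory (including parents) if it is missing; otherwise do nothing.
    if not os.path.isdir(directory):
        os.makedirs(directory)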
示例#29
0
def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, prefix, anchor, lib_type):
    """
    Build pseudo circular reference index and perform reads re-alignment
    Extract BSJ and FSJ reads from alignment results

    Returns
    -------
    str
        output file name

    """
    import utils
    from utils import check_dir
    circ_dir = '{}/circ'.format(outdir)
    check_dir(circ_dir)

    circ_fasta = '{}/circ/{}_index.fa'.format(outdir, prefix)
    circ_info = load_bed(circ_file)
    if rnaser_file:
        LOGGER.info('Loading RNase R results')
        rnaser_exp, rnaser_stat = update_info(circ_info, rnaser_file)

    # extract fasta file for reads alignment
    generate_index(log_file, circ_info, circ_fasta)

    # hisat2-build index
    denovo_index = build_index(log_file, thread, circ_fasta, outdir, prefix)
    LOGGER.debug('De-novo index: {}'.format(denovo_index))

    # hisat2 de novo alignment for candidate reads
    denovo_bam = denovo_alignment(log_file, thread, reads, outdir, prefix)
    LOGGER.debug('De-novo bam: {}'.format(denovo_bam))

    # Find BSJ and FSJ information
    cand_bsj = proc_denovo_bam(denovo_bam, thread, circ_info, anchor, lib_type)
    bsj_reads, fsj_reads = proc_genome_bam(hisat_bam, thread, circ_info, cand_bsj, anchor, circ_dir)

    total_reads, mapped_reads = bam_stat(hisat_bam)
    circ_reads = sum([len(bsj_reads[i]) for i in bsj_reads]) * 2
    sample_stat = (total_reads, mapped_reads, circ_reads)

    sample_exp = expression_level(circ_info, bsj_reads, fsj_reads)

    # circRNA annotation
    header = [
        'Sample: {}'.format(prefix),
        'Total_Reads: {}'.format(total_reads),
        'Mapped_Reads: {}'.format(mapped_reads),
        'Circular_Reads: {}'.format(circ_reads),
    ]
    out_file = '{}/{}.gtf'.format(outdir, prefix)

    if rnaser_file:
        import coeff
        tmp_header, circ_exp = coeff.correction(sample_exp, sample_stat, rnaser_exp, rnaser_stat)
        header += tmp_header
    else:
        circ_exp = sample_exp

    from version import __version__
    header += ['version: {}'.format(__version__), ]
    gtf_info = index_annotation(utils.GTF)
    format_output(circ_info, circ_exp, sample_stat, header, gtf_info, out_file)

    return out_file
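
Read directly off the format strings in proc(), the files it writes end up laid out roughly as follows (with <outdir> and <prefix> standing for the actual arguments):

# Output layout implied by the paths built inside proc():
#   <outdir>/circ/                    working directory for BSJ/FSJ extraction (circ_dir)
#   <outdir>/circ/<prefix>_index.fa   pseudo-circular reference used for re-alignment (circ_fasta)
#   <outdir>/<prefix>.gtf             final annotated output returned by the function (out_file)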
示例#30
0
from config import *
import os
import numpy as np
import mxnet as mx
from model.simple_stack import SimpleStack
from utils import check_dir
from memory import Memory
from environments.SimpleEnv import SimpleEnv
from mxnet import gluon

if os.path.exists(summary):
    os.remove(summary)
ctx = mx.cpu()
for i in ["model_save", "data_save"]:
    check_dir(i)
# build models
online_model = SimpleStack()
offline_model = SimpleStack()
online_model.collect_params().initialize(mx.init.Normal(0.02), ctx=ctx)
offline_model.collect_params().initialize(mx.init.Normal(0.02), ctx=ctx)
offline_model.collect_params().zero_grad()
# create env
env = SimpleEnv(display=True)
env.reset_env()
memory_pool = Memory(memory_length)
annealing = 0
total_reward = np.zeros(num_episode)
eval_result = []
loss_func = gluon.loss.L2Loss()
trainer = gluon.Trainer(offline_model.collect_params(), 'adam',
                        {'learning_rate': lr})
示例#31
0
def mode_train(train_loader, dev_loader, train_size_aug, dev_size_aug):
    check_dir(save_root_dir + '/' + model_name)

    device = torch.device('cuda')

    if model_pretrained:
        print('Loading pretrained model from {}'.format(save_root_dir + '/' + model_pretrained + '/model.pth'))
        model = torch.load(save_root_dir + '/' + model_pretrained + '/model.pth', map_location=device)
    else:
        model = VSNet(num_classes=num_classes)
        model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])

    # criterion = nn.MSELoss(reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    model.to(device)

    scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

    tb.configure(save_root_dir + '/' + model_name)

    start_time = time.time()

    tb_count = 0
    for epoch in range(num_epochs):

        scheduler.step()

        # Training
        model.train()
        running_loss = 0.0
        for i, sample in enumerate(train_loader, 0):
            if i == 1 and epoch == 0:
                start_time = time.time()
            img_a, img_b, label = sample

            optimizer.zero_grad()

            img_a = img_a.to(device)
            img_b = img_b.to(device)
            label = label.to(device)

            output = model(img_a, img_b)

            loss = combined_loss_quat(output, label, weights=weights)

            loss.backward()

            optimizer.step()

            running_loss += loss.item() * output.shape[0]

            output = output.cpu().detach().numpy()
            label = label.cpu().detach().numpy()

            error = np.zeros(8)

            for j in range(output.shape[0]):
                error[:3] += np.abs(output[j, :3] - label[j, :3])

                quat_output = normalize_q(output[j, 3:])
                quat_label = label[j, 3:]

                axis_output, angle_output = axis_angle_from_quat(quat_output)
                axis_label, angle_label = axis_angle_from_quat(quat_label)

                error_mag = np.abs(angle_output - angle_label)
                error_mag = error_mag if error_mag < np.pi else error_mag - np.pi
                error_dir = angle_between_vectors(axis_output, axis_label)
                error[3] += np.nan_to_num(error_mag)
                error[4] += np.nan_to_num(error_dir)

                rpy_output = np.array(euler_from_quaternion(quat_output))
                rpy_label = np.array(euler_from_quaternion(quat_label))
                error[5:] += np.abs(rpy_output - rpy_label)

            error /= output.shape[0]
            error[:3] *= 1000
            error[3:] = np.rad2deg(error[3:])
            est_time = (time.time() - start_time) / (epoch * len(train_loader) + i + 1) * (
                    num_epochs * len(train_loader))
            est_time = str(datetime.timedelta(seconds=est_time))
            print(
                '[TRAIN][{}][EST:{}] Epoch {}, Batch {}, Loss = {:0.7f}, error: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg,roll={:0.2f}deg,pitch={:0.2f}deg,yaw={:0.2f}deg'.format(
                    time.time() - start_time, est_time, epoch + 1, i + 1,
                    loss.item(), *error))

            tb.log_value(name='Loss', value=loss.item(), step=tb_count)
            tb.log_value(name='x/mm', value=error[0], step=tb_count)
            tb.log_value(name='y/mm', value=error[1], step=tb_count)
            tb.log_value(name='z/mm', value=error[2], step=tb_count)
            tb.log_value(name='mag/deg', value=error[3], step=tb_count)
            tb.log_value(name='dir/deg', value=error[4], step=tb_count)
            tb.log_value(name='roll/deg', value=error[5], step=tb_count)
            tb.log_value(name='pitch/deg', value=error[6], step=tb_count)
            tb.log_value(name='yaw/deg', value=error[7], step=tb_count)
            tb_count += 1

        # Dev eval
        model.eval()
        with torch.no_grad():
            running_error_dev = np.zeros(8)
            # running_error_dev = np.zeros(2)
            for i, sample in enumerate(dev_loader, 0):
                img_a, img_b, label = sample

                img_a = img_a.to(device)
                img_b = img_b.to(device)

                output = model(img_a, img_b)

                output = output.cpu().detach().numpy()

                label = label.numpy()

                error = np.zeros(8)
                # error = np.zeros(2)

                for j in range(output.shape[0]):
                    error[:3] += np.abs(output[j, :3] - label[j, :3])

                    quat_output = normalize_q(output[j, 3:])
                    quat_label = label[j, 3:]

                    axis_output, angle_output = axis_angle_from_quat(quat_output)
                    axis_label, angle_label = axis_angle_from_quat(quat_label)

                    error_mag = np.abs(angle_output - angle_label)
                    error_mag = error_mag if error_mag < np.pi else error_mag - np.pi
                    error_dir = angle_between_vectors(axis_output, axis_label)
                    error[3] += np.nan_to_num(error_mag)
                    error[4] += np.nan_to_num(error_dir)

                    rpy_output = np.array(euler_from_quaternion(quat_output))
                    rpy_label = np.array(euler_from_quaternion(quat_label))
                    error[5:] += np.abs(rpy_output - rpy_label)

                error[:3] *= 1000
                error[3:] = np.rad2deg(error[3:])

                running_error_dev += error
                error /= output.shape[0]

                print(
                    '[EVAL][{}] Epoch {}, Batch {}, error: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg'.format(
                        time.time() - start_time, epoch + 1, i + 1, *error))

        average_loss = running_loss / train_size_aug
        average_error = running_error_dev / dev_size_aug
        print(
            '[SUMMARY][{}] Summary: Epoch {}, loss = {:0.7f}, dev_eval: x={:0.2f}mm,y={:0.2f}mm,z={:0.2f}mm,mag={:0.2f}deg,dir={:0.2f}deg,roll={:0.2f}deg,pitch={:0.2f}deg,yaw={:0.2f}deg\n\n'.format(
                time.time() - start_time, epoch + 1, average_loss, *average_error))

        tb.log_value(name='Dev loss', value=average_loss, step=epoch)
        tb.log_value(name='Dev x/mm', value=average_error[0], step=epoch)
        tb.log_value(name='Dev y/mm', value=average_error[1], step=epoch)
        tb.log_value(name='Dev z/mm', value=average_error[2], step=epoch)
        tb.log_value(name='Dev mag/deg', value=average_error[3], step=epoch)
        tb.log_value(name='Dev dir/deg', value=average_error[4], step=epoch)
        tb.log_value(name='Dev roll/deg', value=average_error[5], step=epoch)
        tb.log_value(name='Dev pitch/deg', value=average_error[6], step=epoch)
        tb.log_value(name='Dev yaw/deg', value=average_error[7], step=epoch)

        torch.save(model, save_root_dir + '/' + model_name + '/model.pth')
        print('Model saved at {}/{}/model.pth'.format(save_root_dir, model_name))
示例#32
0
import discord
import logging
from discord.ext import commands
import utils
import sys
import os

CONFIG_FILE = 'discordbot.config'

options = utils.get_opts(sys.argv[1:])

if not utils.check_dir('logs'):
    os.mkdir('logs')

logger = logging.getLogger('discord')
logger.setLevel(logging.INFO)   # Change this to get DEBUG info if necessary
handler = logging.FileHandler(filename='logs/discordbot.log', encoding='utf-8', mode='w')
handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:%(name)s: %(message)s'))
logger.addHandler(handler)


if options.config:
    config = utils.read_config(file=options.config)
else:
    config = utils.read_config()
logger.info(f'Reading Configuration file: {config}')


logger.info('Starting bot...')
bot = commands.Bot(command_prefix=utils.get_prefix, description=config['description'])
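
Note that this bot script treats utils.check_dir as a predicate: it branches on the return value and calls os.mkdir itself, which is a different contract from the create-if-missing helper assumed by most other examples. A sketch of that predicate-style variant, purely as an assumption about this particular utils module:

# Hypothetical predicate-style check_dir matching the usage above:
# report whether the directory already exists instead of creating it.
import os

def check_dir(directory):
    return os.path.isdir(directory)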
示例#33
0
argparser.add_argument("--alignroot", metavar="Alignment Dir", type=str, required=True)
argparser.add_argument("--treeroot", metavar="Tree Dir", type=str, required=True)
argparser.add_argument("--outroot", metavar="Output Dir + working dir + logdir", type=str, required=True)

args = argparser.parse_args()

alignroot = args.alignroot
treeroot = args.treeroot
outroot = args.outroot

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Create log directory and out directory

check_dir(outroot)

out_pre_dir = path.join(outroot, "out")
check_dir(out_pre_dir)

logdir = path.join(outroot, "logs")
check_dir(logdir)

if os.getcwd() != out_pre_dir:
    os.chdir(out_pre_dir)
    

for infile in glob(path.join(alignroot, "*/*.phy")):
    print infile
    
    with open(infile, 'r') as f:
示例#34
0
def parse_args():
	desc = '''TensorFlow 2.0 implementation of Unsupervised Generative Attentional Networks with 
			  Adaptive Layer-Instance Normalization for Image-to-Image Translation (U-GAT-IT)'''
	parser = argparse.ArgumentParser(description=desc)

	parser.add_argument('--dataset_name', type=str, default='selfie2anime')
	parser.add_argument('--phase', type=str, default='tfrecord', choices=('tfrecord', 'train', 'test'))
	parser.add_argument('--img_size', type=int, default=256)
	parser.add_argument('--img_nc', type=int, default=3)
	parser.add_argument('--batch_size', type=int, default=1)

	parser.add_argument('--lr', type=float, default=0.0001)
	parser.add_argument('--iteration', type=int, default=10000)
	parser.add_argument('--epochs', type=int, default=50)
	parser.add_argument('--decay_epochs', type=int, default=50)
	parser.add_argument('--w_adv', type=float, default=1)
	parser.add_argument('--w_cyc', type=float, default=10)
	parser.add_argument('--w_rec', type=float, default=10)
	parser.add_argument('--w_cam', type=float, default=1000)
	parser.add_argument('--gan_type', type=str, default='lsgan', choices=('vanilla', 'lsgan', 'hinge'))

	parser.add_argument('--log_freq', type=int, default=1000)
	parser.add_argument('--output_dir', type=str, default='output')
	parser.add_argument('--log_dir', type=str, default='log')
	parser.add_argument('--sample_dir', type=str, default='sample')
	parser.add_argument('--save_dir', type=str, default='model')
	parser.add_argument('--result_dir', type=str, default='result')

	args = parser.parse_args()
	check_dir(args.output_dir)
	args.output_dir = os.path.join(args.output_dir, f'UGATIT_{args.dataset_name}')
	check_dir(args.output_dir)
	args.log_dir = os.path.join(args.output_dir, args.log_dir)
	check_dir(args.log_dir)
	args.sample_dir = os.path.join(args.output_dir, args.sample_dir)
	check_dir(args.sample_dir)
	args.save_dir = os.path.join(args.output_dir, args.save_dir)
	check_dir(args.save_dir)
	args.result_dir = os.path.join(args.output_dir, args.result_dir)
	check_dir(args.result_dir)

	return args
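
With the default arguments, the chain of check_dir calls in this parse_args() produces the following directory tree (names taken straight from the parser defaults):

# Directory tree created by parse_args() with default arguments:
#   output/
#     UGATIT_selfie2anime/
#       log/
#       sample/
#       model/
#       result/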
示例#35
0
def main():
    args = parser.parse_args()

    # torch setting
    torch.random.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # os setting
    path = args.dataset_path
    train_path = os.path.join(path, "train/train.txt")
    validation_path = os.path.join(path, "valid/valid.txt")
    test_path = os.path.join(path, "test/test.txt")
    params_path = os.path.join(args.model_dir, 'params.json')
    checkpoint_dir = os.path.join(args.model_dir, 'checkpoint')
    tensorboard_log_dir = os.path.join(args.model_dir, 'log')
    utils.check_dir(tensorboard_log_dir)

    entity2id, relation2id = data_loader.create_mappings(train_path)

    # params
    params = utils.Params(params_path)
    params.device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # dataset
    train_set = data_loader.FB15KDataset(train_path, entity2id, relation2id)
    train_generator = torch_data.DataLoader(train_set,
                                            batch_size=params.batch_size)
    validation_set = data_loader.FB15KDataset(validation_path, entity2id,
                                              relation2id)
    validation_generator = torch_data.DataLoader(
        validation_set, batch_size=params.validation_batch_size)
    test_set = data_loader.FB15KDataset(test_path, entity2id, relation2id)
    test_generator = torch_data.DataLoader(
        test_set, batch_size=params.validation_batch_size)

    # model
    model = net.Net(entity_count=len(entity2id),
                    relation_count=len(relation2id),
                    dim=params.embedding_dim,
                    margin=params.margin,
                    device=params.device,
                    norm=params.norm)  # type: torch.nn.Module
    model = model.to(params.device)
    optimizer = optim.SGD(model.parameters(), lr=params.learning_rate)
    summary_writer = tensorboard.SummaryWriter(log_dir=tensorboard_log_dir)
    start_epoch_id = 1
    step = 0
    best_score = 0.0

    print("Training Dataset: entity: {} relation: {} triples: {}".format(
        len(entity2id), len(relation2id), len(train_set)))
    print("Validation Dataset: triples: {}".format(len(validation_set)))
    print("Test Dataset: triples: {}".format(len(test_set)))
    print(model)

    # Train
    for epoch_id in range(start_epoch_id, params.epochs + 1):
        print("Epoch {}/{}".format(epoch_id, params.epochs))

        loss_impacting_samples_count = 0
        samples_count = 0
        model.train()

        with tqdm(total=len(train_generator)) as t:
            for local_heads, local_relations, local_tails in train_generator:
                local_heads, local_relations, local_tails = (local_heads.to(
                    params.device), local_relations.to(
                        params.device), local_tails.to(params.device))

                positive_triples = torch.stack(
                    (local_heads, local_relations, local_tails), dim=1)

                # Preparing negatives.
                # Generate binary tensor to replace either head or tail. 1 means replace head, 0 means replace tail.
                head_or_tail = torch.randint(high=2,
                                             size=local_heads.size(),
                                             device=params.device)
                random_entities = torch.randint(high=len(entity2id),
                                                size=local_heads.size(),
                                                device=params.device)
                broken_heads = torch.where(head_or_tail == 1, random_entities,
                                           local_heads)
                broken_tails = torch.where(head_or_tail == 0, random_entities,
                                           local_tails)
                negative_triples = torch.stack(
                    (broken_heads, local_relations, broken_tails), dim=1)

                optimizer.zero_grad()

                loss, pd, nd = model(positive_triples, negative_triples)
                loss.mean().backward()

                summary_writer.add_scalar('Loss/train',
                                          loss.mean().data.cpu().numpy(),
                                          global_step=step)
                summary_writer.add_scalar('Distance/positive',
                                          pd.sum().data.cpu().numpy(),
                                          global_step=step)
                summary_writer.add_scalar('Distance/negative',
                                          nd.sum().data.cpu().numpy(),
                                          global_step=step)

                loss = loss.data.cpu()
                loss_impacting_samples_count += loss.nonzero().size()[0]
                samples_count += loss.size()[0]

                optimizer.step()
                step += 1

                t.set_postfix(loss=loss_impacting_samples_count /
                              samples_count * 100)
                t.update()

            summary_writer.add_scalar('Metrics/batch_loss',
                                      loss_impacting_samples_count /
                                      samples_count * 100,
                                      global_step=epoch_id)

            # validation
            if epoch_id % params.validation_freq == 0:
                model.eval()
                _, _, hits_at_10, _ = evaluate(
                    model=model,
                    data_generator=validation_generator,
                    entities_count=len(entity2id),
                    device=params.device,
                    summary_writer=summary_writer,
                    epoch_id=epoch_id,
                    metric_suffix="val")
                score = hits_at_10
                if score > best_score:
                    best_score = score
                    utils.save_checkpoint(checkpoint_dir, model, optimizer,
                                          epoch_id, step, best_score)

    # Testing the best checkpoint on test dataset
    utils.load_checkpoint(checkpoint_dir, model, optimizer)
    best_model = model.to(params.device)
    best_model.eval()
    scores = evaluate(model=best_model,
                      data_generator=test_generator,
                      entities_count=len(entity2id),
                      device=params.device,
                      summary_writer=summary_writer,
                      epoch_id=1,
                      metric_suffix="test")
    print("Test scores: \n hit%1: {} \n hit%3: {} \nh it%10: {} \n mrr: {}".
          format(scores[0], scores[1], scores[2], scores[3]))

    eval_path = os.path.join(args.model_dir, 'eval.json')
    evals_params = utils.Params(eval_path)
    evals_params.hit_1 = scores[0]
    evals_params.hit_3 = scores[1]
    evals_params.hit_10 = scores[2]
    evals_params.mrr = scores[3]
    evals_params.best_score = best_score
    evals_params.save(eval_path)
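
The negative-sampling step in the training loop above corrupts each positive triple by swapping either its head or its tail for a random entity, chosen per sample by the binary head_or_tail tensor. A tiny self-contained illustration of that torch.where pattern, using toy tensors rather than the FB15K data:

# Toy illustration of the head/tail corruption used in the training loop above.
import torch

heads = torch.tensor([0, 1, 2])
relations = torch.tensor([7, 7, 8])
tails = torch.tensor([3, 4, 5])

head_or_tail = torch.tensor([1, 0, 1])      # 1 -> corrupt the head, 0 -> corrupt the tail
random_entities = torch.tensor([9, 9, 9])

broken_heads = torch.where(head_or_tail == 1, random_entities, heads)  # tensor([9, 1, 9])
broken_tails = torch.where(head_or_tail == 0, random_entities, tails)  # tensor([3, 9, 5])
negative_triples = torch.stack((broken_heads, relations, broken_tails), dim=1)
print(negative_triples)  # tensor([[9, 7, 3], [1, 7, 9], [9, 8, 5]])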
示例#36
0
def test(model, test_loader, conf, logger, epoch, best_random_error):
    model.eval()
    acc = 0.  # Accuracy
    random_error_avg = 0.0
    random_precision_avg = 0.0
    random_recall_avg = 0.0
    false_split_avg = 0.0
    false_merge_avg = 0.0
    length = 0

    # here we store the 5 test images in the same big image
    result_store_dir = os.path.join(conf['exp_dir'], 'result')
    if conf['rank'] == 0:
        check_dir(result_store_dir)
    store_path_fmt = os.path.join(result_store_dir, 'epoch-{}-{}.png')

    # here we store each predicted image in a .png
    result_single_image_dir = os.path.join(conf['exp_dir'], 'result_single',
                                           'epoch-{}'.format(epoch))
    dist.barrier()

    with torch.no_grad():
        for iter_idx, (images, labels, _) in enumerate(test_loader):
            images = images.to(conf['device'])
            labels = labels.to(conf['device'])
            seg_res = model(images)
            seg_prob = torch.sigmoid(seg_res)

            acc += get_accuracy(seg_prob, labels)
            random_error, random_precision, random_recall, false_split, false_merge = get_metric_val(
                labels, seg_prob)
            random_error_avg += random_error
            random_precision_avg += random_precision
            random_recall_avg += random_recall
            false_split_avg += false_split
            false_merge_avg += false_merge
            length += images.size(0)

            if epoch % conf['save_per_epoch'] == 0 and conf['rank'] == 0:

                torchvision.utils.save_image(
                    images.data.cpu() + 0.5,
                    store_path_fmt.format(epoch, 'image'))
                torchvision.utils.save_image(
                    labels.data.cpu(), store_path_fmt.format(epoch, 'GT'))
                torchvision.utils.save_image(
                    seg_prob.data.cpu(), store_path_fmt.format(epoch, 'SR'))
                torchvision.utils.save_image(
                    (seg_prob > 0.5).float().data.cpu(),
                    store_path_fmt.format(epoch, 'PRE'))

                check_dir(result_single_image_dir)
                for i in range(seg_prob.shape[0]):
                    store_path = os.path.join(result_single_image_dir,
                                              '{}.png'.format(i))
                    torchvision.utils.save_image(
                        (seg_prob > 0.5).float()[i].data.cpu(), store_path)
                    store_path = os.path.join(result_single_image_dir,
                                              '{}-prob.png'.format(i))
                    torchvision.utils.save_image(seg_prob[i].data.cpu(),
                                                 store_path)

    acc = acc / len(test_loader)
    random_error_avg /= len(test_loader)
    random_precision_avg /= len(test_loader)
    random_recall_avg /= len(test_loader)
    false_split_avg /= len(test_loader)
    false_merge_avg /= len(test_loader)

    if random_error_avg < best_random_error and conf['rank'] == 0:
        torchvision.utils.save_image(images.data.cpu() + 0.5,
                                     store_path_fmt.format('Best', 'image'))
        torchvision.utils.save_image(labels.data.cpu(),
                                     store_path_fmt.format('Best', 'GT'))
        torchvision.utils.save_image(seg_prob.data.cpu(),
                                     store_path_fmt.format('Best', 'SR'))
        torchvision.utils.save_image((seg_prob > 0.5).float().data.cpu(),
                                     store_path_fmt.format('Best', 'PRE'))
        result_single_image_dir = os.path.join(conf['exp_dir'],
                                               'result_single', 'Best')
        check_dir(result_single_image_dir)
        for i in range(seg_prob.shape[0]):
            store_path = os.path.join(result_single_image_dir,
                                      '{}.png'.format(i))
            torchvision.utils.save_image(
                (seg_prob > 0.5).float()[i].data.cpu(), store_path)
            store_path = os.path.join(result_single_image_dir,
                                      '{}-prob.png'.format(i))
            torchvision.utils.save_image(seg_prob[i].data.cpu(), store_path)

    # if conf['rank'] == 0:
    #     logger.info("[Test] Rank: {} Epoch: [{}/{}] Acc: {:.3f}".format(conf['rank'],
    #                                                         epoch, conf['num_epochs'],
    #                                                         acc))
    if conf['rank'] == 0:
        logger.info(
            "[Test] Rank: {} Epoch: [{}/{}] Acc: {:.3f} R_error: {:.3f} R_pre: {:.3f} R_recall: {:.3f}"
            " F_split: {:.2f} F_merge: {:.2f}".format(
                conf['rank'], epoch, conf['num_epochs'], acc, random_error_avg,
                random_precision_avg, random_recall_avg, false_split_avg,
                false_merge_avg))

    return acc, random_error_avg, random_precision_avg, random_recall_avg, false_split_avg, false_merge_avg
示例#37
0
def main():
    textgrid_folder = sys.argv[1]
    out_dir = sys.argv[2] + '/out/'
    utils.check_dir(out_dir)
    create_files(textgrid_folder, out_dir)
示例#38
0
def format_alignment(fasta, tree, outdir):
    
    treeroot = tree
    fastaroot = path.join(fasta, "*/*prank.best.fas")
    
    check_dir(outdir)


    for infile in glob(fastaroot):
        
        # print progress
        print infile
        
        basename = path.basename(infile).partition('.')[0]
        basename = "".join(basename.split("_")[0] + "_" + basename.split("_")[1])
        prefix = basename.partition('_')[0][:2]
        
        fastafile = infile 
        treedir = path.join(treeroot, prefix)
        treefile = path.join(treedir, basename + '.nh')
        
        # create the first 2 directories (fasta_out, fasta_AA_out)
        
        fasta_out_dir = path.join(outdir, "fasta")
        check_dir(fasta_out_dir)
        fasta_AA_out_dir = path.join(outdir, "fasta_AA")
        check_dir(fasta_AA_out_dir)
        
        fasta_out_subdir = path.join(fasta_out_dir, prefix)
        check_dir(fasta_out_subdir)
        fasta_out_file_path = path.join(fasta_out_subdir, "".join(basename + ".fa"))
        fasta_AA_out_subdir = path.join(fasta_AA_out_dir, prefix)
        check_dir(fasta_AA_out_subdir)
        fasta_AA_out_file_path = path.join(fasta_AA_out_subdir, "".join(basename + ".fa"))
        
        fasta_out_file = open(fasta_out_file_path, "w")
        fasta_AA_out_file = open(fasta_AA_out_file_path, "w")        

          
        for ID in SeqIO.parse(fastafile,"fasta", alphabet=IUPAC.unambiguous_dna):
            
            tree_ids = Tree(newick=treefile)
            for tree_id in tree_ids.iter_leaf_names():
                
                if tree_id.find(ID.id) != -1:
                    #print ID.id
                    ID.id = tree_id
                    #ID.name = ""
                    ID.description = ""
                    #print ID.id
                    #print ID
                    
                    # write the normal fasta out
                    SeqIO.write(ID, fasta_out_file, "fasta")
                    
                    # translate cDNA and write AA fasta
                    aa_seq = []
                    coding_dna = ID.seq
                    #print coding_dna
                    for codon in grouper(coding_dna, 3):
                        cog = "".join(codon)
                        if cog == "---":
                            aa_seq.append("-")
                        else:
                            cog_aa = translate(cog)
                            aa_seq.append(cog_aa)
                    aa_seq = "".join(aa_seq)

                    ID = SeqRecord(Seq(aa_seq, IUPAC.protein), id = ID.id, name = ID.name)
                    ID.description = ""

                    SeqIO.write(ID, fasta_AA_out_file, "fasta")
                    
        fasta_out_file.close()
        fasta_AA_out_file.close()
        
        phy_out_dir = path.join(outdir, "phylip")
        check_dir(phy_out_dir)
        phy_AA_out_dir = path.join(outdir, "phylip_AA")
        check_dir(phy_AA_out_dir)
        
        phy_out_subdir = path.join(phy_out_dir, prefix)
        check_dir(phy_out_subdir)
        phy_out_file_path = path.join(phy_out_subdir, "".join(basename + ".phy"))
        phy_AA_out_subdir = path.join(phy_AA_out_dir, prefix)
        check_dir(phy_AA_out_subdir)
        phy_AA_out_file_path = path.join(phy_AA_out_subdir, "".join(basename + ".phy"))

        fasta_alignment = open(fasta_out_file_path, "rU")
        fasta_AA_alignment = open(fasta_AA_out_file_path, "rU")
        
        phy_out_file = open(phy_out_file_path, "w")
        phy_AA_out_file = open(phy_AA_out_file_path, "w")
                        
        alignments = AlignIO.parse(fasta_alignment, "fasta")
        AlignIO.write(alignments, phy_out_file, "phylip-relaxed")
        
        fasta_alignment.close()
        phy_out_file.close()

        alignments_AA = AlignIO.parse(fasta_AA_alignment, "fasta")       
        AlignIO.write(alignments_AA, phy_AA_out_file, "phylip-relaxed")

        fasta_AA_alignment.close()
        phy_AA_out_file.close()
示例#39
0
def main(config, rank, world_size, gpu_id, port, kwargs):
    torch.backends.cudnn.benchmark = True

    conf = parse_config_or_kwargs(config, **kwargs)

    # --------- multi machine train set up --------------
    if conf['train_local'] == 1:
        host_addr = 'localhost'
        conf['rank'] = rank
        conf['local_rank'] = gpu_id  # specify the local gpu id
        conf['world_size'] = world_size
        dist_init(host_addr, conf['rank'], conf['local_rank'],
                  conf['world_size'], port)
    else:
        host_addr = getoneNode()
        conf['rank'] = int(os.environ['SLURM_PROCID'])
        conf['local_rank'] = int(os.environ['SLURM_LOCALID'])
        conf['world_size'] = int(os.environ['SLURM_NTASKS'])
        dist_init(host_addr, conf['rank'], conf['local_rank'],
                  conf['world_size'], '2' + os.environ['SLURM_JOBID'][-4:])
        gpu_id = conf['local_rank']
    # --------- multi machine train set up --------------

    # setup logger
    if conf['rank'] == 0:
        check_dir(conf['exp_dir'])
        logger = get_logger_2(os.path.join(conf['exp_dir'], 'train.log'),
                              "[ %(asctime)s ] %(message)s")
    dist.barrier()  # let the rank 0 mkdir first
    if conf['rank'] != 0:
        logger = get_logger_2(os.path.join(conf['exp_dir'], 'train.log'),
                              "[ %(asctime)s ] %(message)s")

    logger.info("Rank: {}/{}, local rank:{} is running".format(
        conf['rank'], conf['world_size'], conf['rank']))

    # write the config file to the exp_dir
    if conf['rank'] == 0:
        store_path = os.path.join(conf['exp_dir'], 'config.yaml')
        store_yaml(config, store_path, **kwargs)

    cuda_id = 'cuda:' + str(gpu_id)
    conf['device'] = torch.device(
        cuda_id if torch.cuda.is_available() else 'cpu')

    model_dir = os.path.join(conf['exp_dir'], 'models')
    if conf['rank'] == 0:
        check_dir(model_dir)
    conf['checkpoint_format'] = os.path.join(model_dir, '{}.th')

    set_seed(666 + conf['rank'])

    if 'R' in conf['model_type']:
        model = eval(conf['model_type'])(base_ch_num=conf['base_ch_num'],
                                         t=conf['t'])
    else:
        model = eval(conf['model_type'])(base_ch_num=conf['base_ch_num'])
    model = model.to(conf['device'])
    model = DDP(model,
                device_ids=[conf['local_rank']],
                output_device=conf['local_rank'])
    optimizer = optim.Adam(model.parameters(),
                           lr=conf['lr'],
                           betas=(0.5, 0.99))

    if conf['rank'] == 0:
        num_params = sum(param.numel() for param in model.parameters())
        logger.info("Model type: {} Base channel num:{}".format(
            conf['model_type'], conf['base_ch_num']))
        logger.info("Number of parameters: {:.4f}M".format(1.0 * num_params /
                                                           1e6))
        logger.info(optimizer)

    train_set = ImageFolder(root=conf['root'],
                            mode='train',
                            augmentation_prob=conf['aug_prob'],
                            crop_size_min=conf['crop_size_min'],
                            crop_size_max=conf['crop_size_max'],
                            data_num=conf['data_num'],
                            gauss_size=conf['gauss_size'],
                            data_aug_list=conf['aug_list'])
    train_loader = DataLoader(dataset=train_set,
                              batch_size=conf['batch_size'],
                              shuffle=conf['shuffle'],
                              num_workers=conf['num_workers'])

    dev_set = ImageFolder(root=conf['root'],
                          mode='train',
                          augmentation_prob=0.0)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=5,
                            shuffle=False,
                            num_workers=1)

    valid_set = ImageFolder(root=conf['root'], mode='valid')
    valid_loader = DataLoader(dataset=valid_set,
                              batch_size=5,
                              shuffle=False,
                              num_workers=1)

    test_set = ImageFolder(root=conf['root'], mode='test')
    test_loader = DataLoader(dataset=test_set,
                             batch_size=5,
                             shuffle=False,
                             num_workers=1)

    dist.barrier()  # synchronize here
    train(model, train_loader, test_loader, dev_loader, optimizer, conf,
          logger)
示例#40
0
文件: main.py 项目: zhaoqingpu/TF2GAN
def parse_args():
    desc = 'TensorFlow 2.0 implementation of Deep Convolutional Generative Adversarial Network (DCGAN)'
    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('--dataset_name', type=str, default='celeba')
    parser.add_argument('--phase',
                        type=str,
                        default='tfrecord',
                        choices=('tfrecord', 'train', 'test'))
    parser.add_argument('--img_size', type=int, default=64)
    parser.add_argument('--img_nc', type=int, default=3)
    parser.add_argument('--z_dim', type=int, default=100)

    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--iteration', type=int, default=100000)
    parser.add_argument('--log_freq', type=int, default=1000)
    parser.add_argument('--sample_freq', type=int, default=1000)
    parser.add_argument('--save_freq', type=int, default=10000)
    parser.add_argument('--output_dir', type=str, default='output')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--sample_dir', type=str, default='sample')
    parser.add_argument('--save_dir', type=str, default='model')
    parser.add_argument('--result_dir', type=str, default='result')

    parser.add_argument('--lr', type=float, default=0.0002)
    parser.add_argument('--gan_type',
                        type=str,
                        default='vanilla',
                        choices=('vanilla', 'lsgan', 'hinge'))

    args = parser.parse_args()
    check_dir(args.output_dir)
    args.output_dir = os.path.join(args.output_dir,
                                   f'DCGAN_{args.dataset_name}')
    check_dir(args.output_dir)
    args.log_dir = os.path.join(args.output_dir, args.log_dir)
    check_dir(args.log_dir)
    args.sample_dir = os.path.join(args.output_dir, args.sample_dir)
    check_dir(args.sample_dir)
    args.save_dir = os.path.join(args.output_dir, args.save_dir)
    check_dir(args.save_dir)
    args.result_dir = os.path.join(args.output_dir, args.result_dir)
    check_dir(args.result_dir)

    return args
示例#41
0
    def process(self, image, output_path=None, output_name=None):
        image_origin = image
        check_dir(output_path)

        cv2.imwrite(join(output_path, output_name + '.origin.png'),
                    image_origin)
        # image_name =
        # image_origin = cv2.imread(image_path)
        image_origin_height, image_origin_width = image_origin.shape[0:2]
        # print('Width', image_origin_width, 'Height', image_origin_height)

        image_crop, image_edge = format_image_rgb(image_origin)
        cv2.imwrite(join(output_path, output_name + '.landmark.crop.png'),
                    image_crop)
        # print('Image Data', image_crop, 'Image Edge', image_edge)

        image_crop_resize = cv2.resize(image_crop, (128, 128))
        cv2.imwrite(join(output_path, output_name + '.landmark.resize.png'),
                    image_crop_resize)

        # image_data = cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
        # print('Image', image_crop_resize)

        predictions = self.sess.run(self.landmark_logits,
                                    feed_dict={self.inputs: image_crop_resize})
        # print(predictions)
        # print('Len predictions', predictions)

        marks = np.array(predictions).flatten()
        marks = np.reshape(marks, (-1, 2))
        # print(marks)

        # width =
        # print('Image edge shape', image_edge)
        # to do multiply
        marks *= (image_edge[2] - image_edge[0])
        marks[:, 0] += image_edge[0]
        marks[:, 1] += image_edge[1]
        # print(marks)

        with open(join(output_path, output_name + '.marks.txt'),
                  'w',
                  encoding='utf-8') as f:
            f.write(json.dumps(marks.tolist()))

        for mark in marks:
            cv2.circle(image_origin, tuple(mark), 3, (255, 0, 0))

        cv2.imwrite(join(output_path, output_name + '.landmark.png'),
                    image_origin)

        pose_estimator = PoseEstimator(img_size=(image_origin_height,
                                                 image_origin_width))
        # pose_estimator
        pose = pose_estimator.solve_pose_by_68_points(marks)
        print('Pose', pose)

        with open(join(output_path, output_name + '.pose.txt'),
                  'w',
                  encoding='utf-8') as f:
            f.write(json.dumps(pose))

        return pose
示例#42
0
import os
import re

import torch
from torch.utils.data import DataLoader
from scipy.io import savemat

from utils import check_dir
# InitParser, Stack_Bi_LSTM and MySet are project-specific modules assumed to be importable here.

gpu_id = 0
batch_size = 1000

torch.cuda.set_device(gpu_id)

parser = InitParser()
net = Stack_Bi_LSTM(parser)
weights_path = "../output/fine_tuning_result/Network_fine_tuning.pth.gz"
weights = torch.load(weights_path, map_location='cuda:%d' % gpu_id)  # load weights onto the selected GPU
net.load_state_dict(weights)
net.cuda().eval()  # evaluation mode, on GPU


all_data_set = MySet(parser.fine_tuning_txt_path , mode="all")
all_data_loader = DataLoader(all_data_set, batch_size=batch_size, shuffle=True)

output_path = '../LSTM_feature'
check_dir(output_path)

with torch.no_grad():
    for batch_idx, (sequence, label, name) in enumerate(all_data_loader):
        sequence = sequence.float().cuda()  #GPU
        label = label.data.numpy()
        predict,feature = net(sequence.permute(1,0,2))

feature = feature.data.cpu().numpy()
for i in range(feature.shape[0]):
    savemat(os.path.join(output_path,re.search('[a-z]*_[0-9]*.mat',name[i]).group()),{'temp':feature[i,:]})
示例#43
0
        nKnovel=opt.test_way,
        nKbase=0,
        nExemplars=opt.val_shot,  # num training examples per novel category
        nTestNovel=opt.val_query *
        opt.test_way,  # num test examples for all the novel categories
        nTestBase=0,  # num test examples for all the base categories
        batch_size=1,
        num_workers=0,
        epoch_size=1 * opt.val_episode,  # num of batches per epoch
    )

    if opt.aws == 1:
        set_gpu(opt.gpu)

    # check_dir('./experiments/')
    check_dir(opt.save_path)
    check_dir(opt.tensorboard_dir)

    # debug the GPU part
    # print("Device Count: ", torch.cuda.device_count())

    # print("Dev 1: ", torch.cuda.get_device_name(0))
    # print("Dev 2: ", torch.cuda.get_device_name(1))
    # print("Dev 3: ", torch.cuda.get_device_name(2))
    # print("Dev 4: ", torch.cuda.get_device_name(3))

    log_file_path = os.path.join(opt.save_path, "train_log.txt")

    print(log_file_path)

    log(log_file_path, str(vars(opt)))
示例#44
0
import utils
from utils import check_dir

fubar_cmd = "~sparks/hyphy-hyphyqt/HYPHYMP {0}"

argparser = argparse.ArgumentParser()
argparser.add_argument('--inroot', metavar='input_root', type=str, required=True)
argparser.add_argument('--logdir', metavar='log_dir', type=str, required=True)

args = argparser.parse_args()

inroot = args.inroot
logroot = args.logdir

utils.check_dir(logroot)

sizes = "Small", "Medium", "Big"
species_numbers = "6species", "12species", "17species", "44species"

# prepare each of the 12 directories with sequences for slr
for species in species_numbers:
    print species
    check_dir(path.join(inroot,species)) 
    check_dir(path.join(logroot, species))
    
    for size in sizes:
        print size
        check_dir(path.join(inroot, species, size))
        check_dir(path.join(logroot, species, size))
示例#45
0
argparser.add_argument("--outdir", metavar="Output Dir + working dir + logdir", type=str, required=True)
argparser.add_argument("--gpf", metavar="template Dir", type=str, required=True)

args = argparser.parse_args()

treedir = args.treedir
outdir = args.outdir
gpf = args.gpf


####### Prep ###########

sizes = "Small", "Medium", "Big"
species_numbers = "6species", "12species", "17species", "44species"

check_dir(outdir)
os.chdir(outdir)

# extract information out of the gpf (gideon pomeranz file)
parameters = open(gpf).read()
m = re.search("(?<=n_sites=)\w+", parameters)
n_sites = m.group(0)
n_sites = int(n_sites)

m = re.search("(?<=n_runs=)\w+", parameters)
n_runs = m.group(0)
n_runs = int(n_runs)

m = re.search("(?<=alphas=).+", parameters)
alphas = m.group(0)
alphas = alphas.split(",")
示例#46
0
inputRedirect["04"]="%s";
inputRedirect["05"]="20";
inputRedirect["06"]="5";
inputRedirect["07"]="2000000";
inputRedirect["08"]="1000000";
inputRedirect["09"]="100";
inputRedirect["10"]="0.5";
ExecuteAFile ("/nfs/research2/goldman/gregs/HBL/FUBAR/FUBAR.bf", inputRedirect);
"""    
argparser = argparse.ArgumentParser()
argparser.add_argument('--indir', metavar='input_directory', type=str, required=True)
argparser.add_argument('--outdir', metavar='input_directory', type=str, required=True)
argparser.add_argument('--clade', metavar='input_directory', type=str, required=True)
args = argparser.parse_args()

utils.check_dir(args.outdir)
utils.check_dir(path.join(args.outdir, args.clade))

def read_slr(fh):
    stats = fh.readline()
    seqs = []

    for l in utils.grouper(fh, 2):
        name = l[0].rstrip()
        seq = l[1].rstrip()
        seqs.append(SeqRecord(id=name, seq=Seq(seq), description=""))
        
    return seqs

for f in glob.glob(path.join(args.indir, args.clade,
                             '*', '*_slr.paml')):
示例#47
0
    def __set_cfg(self, cfgdir):
        '''Sets the config files and checks the directory'''
        cfgdir = os.path.join(cfgdir, self.name)
        utils.check_dir(cfgdir)
        self.sample_cfg = os.path.join(cfgdir, '{0.name}.cfg'.format(self))
        self.report_yaml = os.path.join(cfgdir, '{0.name}.build_reporter.yaml'.format(self))
示例#48
0
def test_lstm(**kwargs):
    """
    Wrapper function for training and testing LSTM

    :type fold: int
    :param fold: fold index of the ATIS dataset, from 0 to 4.

    :type lr: float
    :param lr: learning rate used (factor for the stochastic gradient).

    :type nepochs: int
    :param nepochs: maximal number of epochs to run the optimizer.

    :type win: int
    :param win: number of words in the context window.

    :type nhidden: int
    :param nhidden: number of hidden units.

    :type emb_dimension: int
    :param emb_dimension: dimension of word embedding.

    :type verbose: boolean
    :param verbose: to print out epoch summary or not to.

    :type decay: boolean
    :param decay: decay on the learning rate if improvement stop.

    :type savemodel: boolean
    :param savemodel: save the trained model or not.

    :type normal: boolean
    :param normal: normalize word embeddings after each update or not.

    :type folder: string
    :param folder: path to the folder where results will be stored.

    """
    # process input arguments
    param = {
        'experiment': 'standard',
        'lr': 0.1,
        'verbose': True,
        'decay': True,
        'win': 3,
        'nhidden': 300,
        'nhidden2': 300,
        'seed': 345,
        'emb_dimension': 90,
        'nepochs': 40,
        'savemodel': False,
        'normal': True,
        'layer_norm': False,
        'minibatch_size': 4978,
        'folder': '../result'
    }

    param_diff = set(kwargs.keys()) - set(param.keys())
    if param_diff:
        raise KeyError("invalid arguments:" + str(tuple(param_diff)))
    param.update(kwargs)

    if param['verbose']:
        for k, v in param.items():
            print("%s: %s" % (k, v))

    # create result folder if not exists
    check_dir(param['folder'])

    # load the dataset
    print('... loading the dataset')
    train_set, valid_set, test_set, dic = load_data(3)

    train_set = list(train_set)
    valid_set = list(valid_set)

    # Add validation set to train set
    for i in range(3):
        train_set[i] += valid_set[i]

    # create mapping from index to label, and index to word
    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    # unpack dataset
    train_lex, train_ne, train_y = train_set
    test_lex, test_ne, test_y = test_set

    n_trainbatches = len(train_lex) // param['minibatch_size']

    print("Sentences in train: %d, Words in train: %d" %
          (count_of_words_and_sentences(train_lex)))
    print("Sentences in test: %d, Words in test: %d" %
          (count_of_words_and_sentences(test_lex)))

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    groundtruth_test = [[idx2label[x] for x in y] for y in test_y]
    words_test = [[idx2word[x] for x in w] for w in test_lex]

    # instantiate the model
    numpy.random.seed(param['seed'])
    random.seed(param['seed'])

    print('... building the model')
    lstm = LSTM(n_hidden=param['nhidden'],
                n_hidden2=param['nhidden2'],
                n_out=nclasses,
                n_emb=vocsize,
                dim_emb=param['emb_dimension'],
                cwind_size=param['win'],
                normal=param['normal'],
                layer_norm=param['layer_norm'],
                experiment=param['experiment'])

    # train with early stopping on validation set
    print('... training')
    best_f1 = -numpy.inf
    param['clr'] = param['lr']
    for e in range(param['nepochs']):

        # shuffle
        shuffle([train_lex, train_ne, train_y], param['seed'])

        param['ce'] = e
        tic = timeit.default_timer()

        for minibatch_index in range(n_trainbatches):

            for i in range(minibatch_index * param['minibatch_size'],
                           (1 + minibatch_index) * param['minibatch_size']):
                x = train_lex[i]
                y = train_y[i]
                res = lstm.train(x, y, param['win'], param['clr'])

            predictions_test = [[
                idx2label[x] for x in lstm.classify(
                    numpy.asarray(contextwin(x, param['win'])).astype('int32'))
            ] for x in test_lex]

            # evaluation // compute the accuracy using conlleval.pl
            res_test = conlleval(predictions_test, groundtruth_test,
                                 words_test,
                                 param['folder'] + '/current.test.txt',
                                 param['folder'])

            if res_test['f1'] > best_f1:

                if param['savemodel']:
                    lstm.save(param['folder'])

                best_lstm = copy.deepcopy(lstm)
                best_f1 = res_test['f1']

                if param['verbose']:
                    print(
                        'NEW BEST: epoch %d, minibatch %d/%d, best test F1: %.3f'
                        % (e, minibatch_index + 1, n_trainbatches,
                           res_test['f1']))

                param['tf1'] = res_test['f1']
                param['tp'] = res_test['p']
                param['tr'] = res_test['r']
                param['be'] = e

                os.rename(param['folder'] + '/current.test.txt',
                          param['folder'] + '/best.test.txt')
            else:
                if param['verbose']:
                    print('')

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            print("Decay happened. New Learning Rate:", param['clr'])
            lstm = best_lstm

        if param['clr'] < 0.00001:
            break

    print('BEST RESULT: epoch', param['be'], 'best test F1', param['tf1'],
          'with the model', param['folder'])

    return lstm, dic
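
Since test_lstm only accepts keyword arguments and falls back to the defaults in its param dict, a typical call (the values here are just an illustration) looks like:

# Example invocation; any keyword that is not a key of the default param dict raises KeyError.
lstm, dic = test_lstm(lr=0.05, nepochs=20, nhidden=200, folder='../result', verbose=True)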
示例#49
0
    def __init__(self, queue, step, pid, voc_size, valid_data_flow):
        self.queue = queue
        self.valid_data_flow = valid_data_flow
        threading.Thread.__init__(self)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.train_in_seq = tf.placeholder(tf.int32,
                                           shape=[dataflow.batch_size, None],
                                           name='in_seq')
        self.train_in_seq_len = tf.placeholder(tf.int32,
                                               shape=[dataflow.batch_size],
                                               name='in_seq_len')
        self.train_target_seq = tf.placeholder(
            tf.int32, shape=[dataflow.batch_size, None], name='target_seq')
        self.train_target_seq_len = tf.placeholder(tf.int32,
                                                   shape=[dataflow.batch_size],
                                                   name='target_seq_len')

        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        self.model = model.Seq2Seq()

        self.model.build(self.train_in_seq,
                         self.train_in_seq_len,
                         self.train_target_seq,
                         self.train_target_seq_len,
                         voc_size,
                         dataflow.hidden_unit,
                         dataflow.layers,
                         dataflow.dropout,
                         dataflow.learning_rate,
                         name_scope='train')
        self.model.build_infer(self.train_in_seq,
                               self.train_in_seq_len,
                               voc_size,
                               dataflow.hidden_unit,
                               dataflow.layers,
                               name_scope='infer')

        self.transfer = model.transfer_params(from_scope='train',
                                              to_sope='infer')
        self.sess.run(tf.global_variables_initializer())
        self.saver = Saver(self.sess)
        if start_step == 1:
            continue_train = False
        else:
            continue_train = True
        self.saver.auto_save_init(save_dir=dataflow.lstm_save_dir,
                                  save_interval=saveTime,
                                  max_keep=5,
                                  scope_name='train',
                                  continue_train=continue_train)
        self.saver.load(dataflow.init_path, scope_name='train', del_scope=True)

        print('Training Begin')

        self.step = step
        print('pid:{}'.format(pid))
        self.pid = pid

        if start_step == 1:
            if check_dir(dataflow.lstm_log_dir):
                del_dir_under(dataflow.lstm_log_dir)
            else:
                create_dir(dataflow.lstm_log_dir)
        else:
            assert check_dir(dataflow.lstm_log_dir)

        self.writer = tf.summary.FileWriter(dataflow.lstm_log_dir,
                                            self.sess.graph)
示例#50
0
from draw import draw_all
from utils import check_dir
from utils import clean_temp_files
from utils import copy_resources_to_processed
from get_data_from_log_file import get_data_from_log_file


if __name__ == '__main__':
    model_name = 'densenet121-train90-lr0.1-batch768'
    # model_name = 'restnet18-train90-lr0.1'
    check_dir()
    get_data_from_log_file(log_file='%s.txt' % model_name)
    draw_all()
    copy_resources_to_processed(model_name=model_name, file_name='modified', replace=True)
    copy_resources_to_processed(model_name=model_name, file_name='plt', replace=True)
    # if you want to clean temp file under postprocess, use this
    clean_temp_files()
    print("All Done!")
    print("you can check resources in '../processed'")
    print("path in project is 'data/postprocess'")


示例#51
0
argparser.add_argument("--alignroot", metavar="Alignment Dir", type=str, required=True)
argparser.add_argument("--treeroot", metavar="Tree Dir", type=str, required=True)
argparser.add_argument("--outroot", metavar="Output Dir + working dir + logdir", type=str, required=True)

args = argparser.parse_args()

alignroot = args.alignroot
treeroot = args.treeroot
outroot = args.outroot

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Create log directory and out directory

check_dir(outroot)

out_pre_dir = path.join(outroot, "out")
check_dir(out_pre_dir)

logdir = path.join(outroot, "logs")
check_dir(logdir)

if os.getcwd() != outroot:
    os.chdir(outroot)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# define the paml command
codeml_cmd = "python /nfs/research2/goldman/pomeranz/tree_stats/Scripts/analyse_codeml.py --alignfile {0} --treefile {1} --template_dir {2} --outfile {3} --workdir {4}"
# start the loop
示例#52
0
文件: main.py 项目: kgarg8/optim-meta
    parser.add_argument('--outer_lr', type=float, default=1e-3)
    parser.add_argument('--outer_opt', type=str, default='Adam')
    parser.add_argument('--lr_sched',
                        type=lambda x: (str(x).lower() == 'true'),
                        default=False)
    # network settings
    parser.add_argument('--net', type=str, default='ConvNet')
    parser.add_argument('--n_conv', type=int, default=4)
    parser.add_argument('--n_dense', type=int, default=0)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--in_channels', type=int, default=3)
    parser.add_argument(
        '--hidden_channels',
        type=int,
        default=64,
        help='Number of channels for each convolutional layer (default: 64).')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    set_seed(args.seed)
    set_gpu(args.device)
    check_dir(args)
    set_logger(os.path.join('logs/', args.exp + '.txt'), log_console=False)
    t1_start = process_time()
    main(args)
    t1_stop = process_time()
    logging.info('Elapsed time = {}'.format(t1_stop - t1_start))
Example #53
0
ens_cdna_dir = args.cds
for f in glob(path.join(ens_cdna_dir, '*.fa')):
    print "Processing", f
    for seqr in SeqIO.parse(f, 'fasta'):
        if seqr.id in ens_map:
            print "Duplicate id", seqr.id
            sys.exit(-1)

        ens_map[t2p[seqr.id]] = seqr.seq

clades_pickle = args.species_cache
clades = pickle.load(open(clades_pickle))

inroot = args.inroot
outroot = args.outroot
utils.check_dir(outroot)

for seqset in glob(path.join(inroot, args.clade, "*", "*.tab")):
    setid = path.basename(seqset).rpartition('.')[0]
    seqs = []
    utils.check_dir(path.join(outroot, args.clade))

    for l in open(seqset):
        seqid, species = l.rstrip().split('\t')
        # if species not in all_species:
        #     continue

        # TODO The completely honest thing to do would be to check if the genes
        # with missing sequences fall into the relevant clade or not
        seq = ens_map.get(seqid)
        if seq is None:
Example #54
0
            data[item] = 0
        data.loc[:, [comConfig.col_has_link]] = data[comConfig.col_content].map(lambda x: whe_link(x))
        data.loc[:, [comConfig.col_has_title]] = data[comConfig.col_content].map(lambda x: whe_title(x))
        data.loc[:, [comConfig.col_has_emoj]] = data[comConfig.col_content].map(lambda x: whe_emoji(x))
        data.loc[:, [comConfig.col_has_at]] = data[comConfig.col_content].map(lambda x: whe_art(x))
        data.loc[:, [comConfig.col_text_len]] = data[comConfig.col_content].map(lambda x: get_Length(x))
        return data


if __name__ == '__main__':
    fe = feature_extraction()
    train_data = fe.read_train_data(fileConfig.csv_dir + fileConfig.file_train_pandas)
    # predict_data = fe.read_predict_data(fileConfig.data_dir + fileConfig.file_weibo_predict_data)
    test_data = fe.read_test_data(fileConfig.csv_dir + fileConfig.file_test_pandas)
    user_dict = fe.get_Dict(train_data)
    mblog_dict = utils.pickle_load(fileConfig.pickle_dir + fileConfig.file_train_mblog_dict_pkl)
    # print(dict)
    print("start create train feature...")
    train_data_updated = fe.build_feature(train_data, user_dict, mblog_dict)
    print("start create test feature...")
    # predict_data_updated = fe.build_feature(predict_data, dict)
    test_data_updated = fe.build_feature(test_data, user_dict, mblog_dict)
    # dict_dataframe=pd.DataFrame(dict).T
    # dict_dataframe.columns=['总数量','总转发','总评论','总赞']
    # dict_dataframe.to_csv('dict_pandas.csv')

    utils.check_dir(fileConfig.csv_dir)
    train_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_train)
    # predict_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_predict)
    test_data_updated.to_csv(fileConfig.csv_dir + fileConfig.file_fe_test)
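
Here utils.check_dir(fileConfig.csv_dir) is called just before the CSVs are written, so it presumably makes sure the output directory exists. A minimal sketch with that assumed behaviour (the real utils module is not part of the excerpt):

import os

def check_dir(directory):
    # create the directory (and any missing parents) if it does not exist yet
    if not os.path.isdir(directory):
        os.makedirs(directory)
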
Example #55
0
argparser.add_argument('--mode', metavar='mode', type=str, required=False, default='codon')
argparser.add_argument('--rerun', action='store_true')

args = argparser.parse_args()

if args.treeroot:
    prank_cmd = "prank -d={0} -t={1} -o={2} -prunetree -" + args.mode
else:
    prank_cmd = "prank -d={0} -o={1} -prunetree -codon" + args.mode

inroot = args.inroot
treeroot = args.treeroot
alndir = args.outroot
logroot = args.logdir

utils.check_dir(logroot)
utils.check_dir(path.join(logroot, args.clade))
utils.check_dir(alndir)
utils.check_dir(path.join(alndir, args.clade))

for infile in glob.glob(path.join(inroot, args.clade, "*", "*.fa")):
    print infile
    basename = path.basename(infile).partition('.')[0]
    prefix = basename.partition('_')[0][:2]

    outdir = path.join(alndir, args.clade, prefix)
    utils.check_dir(outdir)
    outfile = path.join(outdir, basename + '_prank')

    logdir = path.join(logroot, args.clade, prefix)
Example #56
0
def format_trees(treeroot, fastaroot, outroot):

    fastafiles = path.join(fastaroot, "*/*.fa")

    if not os.path.exists(outroot):
        os.makedirs(outroot)

    rooted_out_dir = path.join(outroot, "rooted")
    check_dir(rooted_out_dir)
    unrooted_out_dir = path.join(outroot, "unrooted")
    check_dir(unrooted_out_dir)

    for infile in glob(fastafiles):

        print infile

        basename = path.basename(infile).partition('.')[0]
        basename = basename.split("_")[0] + "_" + basename.split("_")[1]
        prefix = basename.partition('_')[0][:2]

        fastafile = infile
        treedir = path.join(treeroot, prefix)
        treefile = path.join(treedir, basename + '.nh')

        # make the tree object
        tree = Tree(newick=treefile)

        # collect the sequence ids present in the alignment
        seq_ids = [record.id for record in SeqIO.parse(fastafile, "fasta")]

        # delete leaves that are not represented in the alignment
        for leaf_name in tree.iter_leaf_names():
            if any(seq_id in leaf_name for seq_id in seq_ids):
                continue
            leaf = tree.search_nodes(name=leaf_name)[0]
            leaf.delete()

        # create the directories for rooted trees and write the pruned tree
        rooted_out_sub_dir = path.join(rooted_out_dir, prefix)
        check_dir(rooted_out_sub_dir)
        rooted_out_file = path.join(rooted_out_sub_dir, basename + ".nh")
        tree.write(outfile=rooted_out_file, format=6)

        # create subdirectories for unrooted trees, then unroot and write again
        unrooted_out_sub_dir = path.join(unrooted_out_dir, prefix)
        check_dir(unrooted_out_sub_dir)
        unrooted_out_file = path.join(unrooted_out_sub_dir, basename + ".nh")
        tree.unroot()
        tree.write(outfile=unrooted_out_file, format=6)
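
A short usage sketch for format_trees; the three paths below are hypothetical, since the argument handling around this function is not part of the excerpt:

# hypothetical paths; the real script presumably takes these from its CLI arguments
format_trees(treeroot="trees/raw",
             fastaroot="alignments/prank",
             outroot="trees/formatted")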
Example #57
0
                output_results.append(output_text)
                if _ % 100 == 0 and i == 0:
                    print('====================')
                    input_text = decode_text(in_seq[i],
                                             self.valid_data_flow.vocabs)
                    print('src:' + input_text)
                    print('output: ' + ' '.join(output_text))
                    print('target: ' + ' '.join(target_text))
        return bleu.compute_bleu(target_results, output_results)[0] * 100


if __name__ == '__main__':

    pid = os.getpid()

    if not check_dir(dataflow.lstm_save_dir):
        create_dir(dataflow.lstm_save_dir)

    print('loading training data...')
    train_data_flow = dataflow.DataFlow(dataflow.batch_size,
                                        data_dir=dataflow.data_path + 'train/')
    print('loading evaluation data...')
    valid_data_flow = dataflow.DataFlow(dataflow.batch_size,
                                        data_dir=dataflow.data_path + 'test/',
                                        shuffle=True)

    q = queue.Queue(maxsize=100)

    pt = Producter(queue=q, data_loader=train_data_flow, step=maxstep)
    ce = Consumer(step=maxstep,
                  queue=q,
def print_file(arg_namespace):
    _arg = arg_namespace

    printer = print_json
    encoding = _arg.encoding

    target = _arg.target
    dataType = _arg.dataType
    start = _arg.start
    end = _arg.end if _arg.end else start
    filename_time = _arg.filename_time
    filetype = 'json'

    interval_time = _arg.interval_time
    if target=='local':
        localType = _arg.localType
    else:
        localType = None

    if _arg.directory:
        datadir = _arg.directory
    else:
        if (target=='local' and localType):
            datadir = './crawled_data/%s-%s/%s' % (target, localType, dataType)
        else:
            datadir = './crawled_data/%s/%s' % (target, dataType)

    time_string = datetime.today().strftime("%Y%m%d%H%M%S")
    check_dir(datadir)

    jobs = []
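    # one gevent greenlet is spawned per index n below; gevent.joinall waits for them all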
    if target=='local':
        if filename_time:
            for n in range(start, end+1):
                filename = '%s/%s-%s-%s-%d-%s.%s'\
                    % (datadir, target, localType, dataType, n, time_string, filetype)
                job = gevent.spawn(crawl, target=target, localType=localType,\
                    _dataType=dataType, nth=n, filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)
        else:
            for n in range(start, end+1):
                filename = '%s/%s-%s-%s-%d.%s'\
                    % (datadir, target, localType, dataType, n, filetype)
                job = gevent.spawn(crawl, target=target, localType=localType,\
                    _dataType=dataType, nth=n, filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)

    else:
        if filename_time:
            for n in range(start, end+1):
                filename = '%s/%s-%s-%d-%s.%s'\
                        % (datadir, target, dataType, n, time_string, filetype)
                job = gevent.spawn(crawl, target=target, _dataType=dataType, nth=n,\
                        filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)
        else:
            for n in range(start, end+1):
                filename = '%s/%s-%s-%d.%s'\
                        % (datadir, target, dataType, n, filetype)
                job = gevent.spawn(crawl, target=target, _dataType=dataType, nth=n,\
                        filename=filename, encoding=encoding, printer=printer)
                jobs.append(job)

    gevent.joinall(jobs)
    print('Data written to %s' % filename)

    if interval_time != 0 and interval_time is not None:
        s = sched.scheduler(time.time, time.sleep)
        print('The program will crawl the next batch of data in %d seconds.' % interval_time)
        s.enter(interval_time, 1, print_file, kwargs=dict(arg_namespace=_arg))
        s.run()