Example #1
def assignHImass(aa, save=False):
    '''assign HI masses to halos'''
    zz = atoz(aa)
    if rank == 0: print('Redshift = %0.2f' % zz)

    halocat = readincatalog(aa)
    hmass = halocat['Mass'].compute()
    hpos = halocat['Position'].compute()
    #Do hod
    ofolder = myscratch + '/%s/fastpm_%0.4f/' % (sim, aa)
    try:
        os.make_dirs(ofolder)
    except:
        pass
    h1mass = HI_hod(hmass, aa)
    halocat['HImass'] = h1mass

    if save:
        colsave = [cols for cols in halocat.columns]
        colsave = ['ID', 'Position', 'Mass', 'HImass']
        if rank == 0: print(colsave)
        halocat.save(ofolder + 'halocat', colsave)
        if rank == 0: print('Halos saved at path\n%s' % ofolder)

    return halocat
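A side note on the directory-creation idiom used throughout these examples: the standard library spells the call os.makedirs, and since Python 3.2 it accepts exist_ok=True, which makes the try/except-pass wrapper unnecessary. A minimal sketch (ensure_dir is a hypothetical helper name, the path is illustrative):

import os

def ensure_dir(path):
    # Create the full directory tree; exist_ok=True turns the call into a
    # no-op when the directory already exists instead of raising OSError.
    os.makedirs(path, exist_ok=True)

ensure_dir('output/halocat')  # illustrative path, not taken from the example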
Example #2
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)

        test_indices_list = probing_utils.split_with_wasserstein(
            df['text'],
            test_set_size=FLAGS.test_set_size,
            no_of_trials=FLAGS.trial_count,
            min_df=FLAGS.feature_vector_min_df,
            leaf_size=FLAGS.nn_leaf_size)

        experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                           FLAGS.split_name) + '-{}'

        for trial_id in range(FLAGS.trial_count):
            split_dir = experiment_base_dir.format(trial_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))
            logging.info('Starting run: %d.', trial_id)
            test_indices = test_indices_list[trial_id]

            all_indices = set(range(len(df)))
            train_dev_indices = all_indices - set(test_indices)
            dev_indices = random.sample(train_dev_indices, FLAGS.dev_set_size)
            train_indices = list(train_dev_indices - set(dev_indices))

            logging.info('Writing output to file: %s.', data_out_path)

            # Set new labels.
            df.loc[df.index[train_indices], 'set'] = 'tr'
            df.loc[df.index[dev_indices], 'set'] = 'va'
            df.loc[df.index[test_indices], 'set'] = 'te'

            os.make_dirs(probing_dir)

            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'trial_id': trial_id,
                    'train_size': len(train_indices),
                    'dev_size': len(dev_indices),
                    'test_size': len(test_indices),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                df[['set', 'target', 'text']].to_csv(data_file,
                                                     sep='\t',
                                                     header=False,
                                                     index=False,
                                                     quoting=csv.QUOTE_NONE,
                                                     doublequote=False)
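The index bookkeeping inside the trial loop reduces to set arithmetic: drop the test indices, sample the dev set from what is left, and call the remainder train. A small self-contained sketch of that logic (split_train_dev is a hypothetical helper, not part of the script):

import random

def split_train_dev(total_size, test_indices, dev_set_size, seed=0):
    # Everything outside the test split is a train/dev candidate; dev is a
    # random sample of it and train is whatever remains.
    rng = random.Random(seed)
    train_dev = set(range(total_size)) - set(test_indices)
    dev = set(rng.sample(sorted(train_dev), dev_set_size))
    train = sorted(train_dev - dev)
    return train, sorted(dev)

train_idx, dev_idx = split_train_dev(10, test_indices=[0, 1, 2], dev_set_size=2)
print(train_idx, dev_idx)  # e.g. [4, 5, 6, 8, 9] [3, 7]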
Example #3
 def __init__(self, outputDirName, scale=1.0):
     PDFHandler.__init__(self)
     self.outputDirName = outputDirName
     try:
         os.make_dirs(os.path.join(self.outputDirName, 'images'))
     except OSError:
         pass
     self.scale = scale
Example #4
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  for task_name in FLAGS.tasks:
    logging.info('Starting task: %s.', task_name)
    df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)
    df['text_len'] = df['text'].str.split().apply(len)

    # There is only 1 trial. Therefore, in order to stay consistent in directory
    # names we manually add '0' here.
    split_dir = os.path.join(FLAGS.base_out_dir, FLAGS.split_name) + '-0'
    probing_dir = os.path.join(split_dir, 'probing')
    settings_path = os.path.join(probing_dir,
                                 '{}-settings.json'.format(task_name))
    data_out_path = os.path.join(probing_dir, '{}'.format(task_name))

    length_threshold, test_lengths, test_mask = (
        probing_utils.split_by_length_threshold(df, FLAGS.test_set_size))
    train_dev_idxs = np.nonzero(~test_mask)[0]
    assert len(train_dev_idxs) > FLAGS.dev_set_size, 'Dev set size too large.'
    dev_idxs = random.sample(train_dev_idxs.tolist(), FLAGS.dev_set_size)
    train_idxs = list(set(train_dev_idxs) - set(dev_idxs))

    logging.info('Writing output to file: %s.', data_out_path)

    # Set new labels.
    df.loc[test_mask, 'set'] = 'te'
    df.loc[df.index[train_idxs], 'set'] = 'tr'
    df.loc[df.index[dev_idxs], 'set'] = 'va'

    os.make_dirs(probing_dir)

    with open(settings_path, 'w') as settings_file:
      settings = {
          'task_name': task_name,
          'length_threshold': length_threshold,
          'all_lengths': sorted(set(df['text_len'])),
          'test_lengths': sorted(test_lengths),
          'train/dev_lengths': sorted(set(df['text_len']) - set(test_lengths)),
          'train_size': len(train_idxs),
          'dev_size': len(dev_idxs),
          'test_size': int(test_mask.sum()),
          'test_mask': test_mask.values.tolist(),
      }
      logging.info('Settings:\n%r', settings)
      json.dump(settings, settings_file, indent=2)

    with open(data_out_path, 'w') as data_file:
      # Don't add quoting to retain the original format unaltered.
      df[['set', 'target', 'text']].to_csv(
          data_file,
          sep='\t',
          header=False,
          index=False,
          quoting=csv.QUOTE_NONE,
          doublequote=False)
Example #5
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)

        experiment_base_dir = os.path.join(
            FLAGS.base_out_dir,
            '{}{}'.format(FLAGS.num_folds, FLAGS.split_name) + '-{}')
        skf = model_selection.StratifiedKFold(n_splits=FLAGS.num_folds)

        for current_fold_id, (train_indexes, test_indexes) in enumerate(
                skf.split(df['text'], df['target'])):
            split_dir = experiment_base_dir.format(current_fold_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))
            logging.info('Starting run: %d.', current_fold_id)

            # Use the same data for train and dev, because the probing code does some
            # hyperparameter search on dev. We don't wanna tune on the test portion.
            train_set = df.iloc[train_indexes].copy()
            train_set.loc[:, 'set'] = 'tr'
            dev_set = df.iloc[train_indexes].copy()
            dev_set.loc[:, 'set'] = 'va'
            test_set = df.iloc[test_indexes].copy()
            test_set.loc[:, 'set'] = 'te'
            new_data = pd.concat([train_set, dev_set, test_set],
                                 ignore_index=True)

            logging.info('Writing output to file: %s.', data_out_path)
            os.make_dirs(probing_dir)

            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'fold_id': current_fold_id,
                    'train_size': len(train_indexes),
                    'dev_size': len(train_indexes),
                    'test_size': len(test_indexes),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                new_data[['set', 'target', 'text']].to_csv(data_file,
                                                           sep='\t',
                                                           header=False,
                                                           index=False,
                                                           quoting=csv.QUOTE_NONE,
                                                           doublequote=False)
Example #6
 def download(self, url, save_directory):
     file_name = url.split('/')[-1]
     if not os.path.exists(save_directory):
         os.make_dirs(save_directory)
     save_location = os.path.join(save_directory, file_name)
     r = self.session.get(url, stream=True)
     with open(save_location, 'wb') as f:
         for chunk in r.iter_content(chunk_size=1024): 
             if chunk:
                 f.write(chunk)
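The same streaming pattern as a standalone function, for readers who want it outside the class: requests.get(url, stream=True) defers the body download, and iter_content writes it to disk in fixed-size chunks. download_file is a hypothetical name, and the raise_for_status call is an addition, not part of the original method:

import os
import requests

def download_file(url, save_directory):
    # Mirror of the method above, using a one-off requests.get instead of a
    # shared session, and creating the target directory if needed.
    os.makedirs(save_directory, exist_ok=True)
    save_location = os.path.join(save_directory, url.split('/')[-1])
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail loudly on HTTP errors
    with open(save_location, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return save_location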
Example #7
    def create_qr_codes(self, output_dir, overwrite=False):
        # go through all the houses/contacts
        # if the qr month isn't populated, create a qr code pointing to the desired link and with the desired filename
        # save the code in the output_dir
        # update the qr month
        try:
            os.make_dirs(output_dir)
        except:
            pass
        raw_cmd = 'qrencode -o "{}/{}" "{}"'
        date_string = datetime.datetime.now().date().strftime('%Y%m')
        sheet_data = self.sheets_data[sc.sheet_names['houses']]
        for i, row in enumerate(sheet_data):
            if i == 0: continue
            if overwrite or not row[sc.house_data_header.index(sc.qr_date_column_name)]:
                cmd = raw_cmd.format(output_dir,
                        row[sc.house_data_header.index(sc.qr_file_name_column_name)],
                        row[sc.house_data_header.index(sc.unique_url_column_name)])
                rc = os.system(cmd)
                if rc != 0:
                    pdb.set_trace()
                    print('Error creating qr code with filename {}'.format(row[sc.house_data_header.index(sc.qr_file_name_column_name)]))
                    continue
                j = sheet_data[0].index(sc.qr_date_column_name)
                self.update_cell(i, j, date_string, sc.sheet_names['houses'])
# TODO: import the house/contact data named tuple instead of indexing into the header
# Contacts
        sheet_data = self.sheets_data[sc.sheet_names['contacts']]
        for i, row in enumerate(sheet_data):
            if i == 0: continue
            # at least address/email/links need to be populated
            if (overwrite or not row[sc.contact_data_header.index(sc.qr_date_column_name)]) \
                        and \
                        (row[sc.contact_data_header.index(sc.address_column_name)] or row[sc.contact_data_header.index(sc.email_column_name)] or row[sc.contact_data_header.index(sc.links_column_name)] or row[sc.contact_data_header.index(sc.name_column_name)]):
                cmd = raw_cmd.format(output_dir,
                        row[sc.contact_data_header.index(sc.qr_file_name_column_name)],
                        row[sc.contact_data_header.index(sc.unique_url_column_name)])
                rc = os.system(cmd)
                if rc != 0:
                    pdb.set_trace()
                    print('Error creating qr code with filename {}'.format(row[sc.contact_data_header.index(sc.qr_file_name_column_name)]))
                    continue
                j = sheet_data[0].index(sc.qr_date_column_name)
                self.update_cell(i, j, date_string, sc.sheet_names['contacts'])
Example #8
def savemesh(aa, bs=bs, nc=nc, kmin=0.1, R=1, savecat=False):
    '''save HI mesh with modes removed as in the function "removemodes"
    '''

    pm = ParticleMesh(BoxSize=bs, Nmesh=[nc, nc, nc])
    hcat = assignHImass(aa, save=savecat)
    #mesh = pm.paint(hcat['Position'], mass=hcat['HImass'])
    mesh = hcat.to_mesh(BoxSize=bs,
                        Nmesh=[nc, nc, nc],
                        position='Position',
                        value='HImass').to_real_field()
    meshG = mesh.r2c().apply(
        lambda k, v: v * np.exp(-sum(ki**2 for ki in k) * R**2)).c2r()
    path = myscratch + sim + '/fastpm_%0.4f/' % aa + '/HImesh_N%04d' % (nc)
    try:
        os.make_dirs(path)
    except:
        pass
    if rank == 0: print('Save mesh to path \n', path)
    meshkpar, meshkparG = removemodes(mesh, kmin, R)

    mesh = FieldMesh(mesh)
    mesh.save(path, dataset='HI', mode='real')
    meshG = FieldMesh(meshG)
    meshG.save(path,
               dataset='HI-R%dp%d' % (int(R), (R * 10) % 10),
               mode='real')

    meshkpar = FieldMesh(meshkpar)
    dataset = 'kpar%dp%d' % (int(kmin), kmin * 10)  #
    meshkpar.save(path, dataset=dataset, mode='real')

    meshkparG = FieldMesh(meshkparG)
    dataset = 'kpar%dp%d-R%dp%d' % (int(kmin), (kmin * 10) % 10, int(R),
                                    (R * 10) % 10)  #
    meshkparG.save(path, dataset=dataset, mode='real')
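The r2c/apply/c2r chain above multiplies every Fourier mode by exp(-k^2 R^2), i.e. a Gaussian smoothing of radius R (the snippet omits the conventional factor of 1/2 in the exponent). A rough numpy-only sketch of the same operation on a periodic cubic grid, assuming the field and box size are given:

import numpy as np

def gaussian_smooth(field, boxsize, R):
    # field: real 3-D array sampled on a periodic cubic box of side `boxsize`.
    n = field.shape[0]
    k = 2 * np.pi * np.fft.fftfreq(n, d=boxsize / n)
    kx, ky, kz = np.meshgrid(k, k, k, indexing='ij')
    k2 = kx**2 + ky**2 + kz**2
    # Multiply each mode by exp(-k^2 R^2), matching the lambda in the snippet.
    return np.fft.ifftn(np.fft.fftn(field) * np.exp(-k2 * R**2)).real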
Example #9
import os
import pandas as pd
import random

in_dir = '../larger-processed/'
out_dir = '../larger-processed/'

try:
    os.make_dirs(out_dir)
except:
    pass

checks = pd.read_csv(open(in_dir +
                          'large-interventions-2015-12-20-checks.csv'))
interventions = pd.read_csv(
    open(in_dir + 'large-interventions-2015-12-20-interventions.csv'))

num = len(str(max(checks['intervention_num'])))

ids = checks['coin_name'].map(str) + '-' + checks['intervention_num'].map(
    lambda x: str(x).zfill(num))
checks['ids'] = ids

ids = interventions['coin_name'].map(str) + '-' + interventions[
    'intervention_num'].map(lambda x: str(x).zfill(num))
interventions['ids'] = ids

ids = list(set(checks['ids']))

random.shuffle(ids)
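The identifiers built above zero-pad the intervention number to the width of the largest value, so lexicographic order matches numeric order. A toy illustration with made-up data:

import pandas as pd

frame = pd.DataFrame({'coin_name': ['abc', 'abc', 'xyz'],
                      'intervention_num': [3, 12, 104]})
width = len(str(frame['intervention_num'].max()))
frame['ids'] = frame['coin_name'].map(str) + '-' + frame['intervention_num'].map(
    lambda x: str(x).zfill(width))
print(frame['ids'].tolist())  # ['abc-003', 'abc-012', 'xyz-104']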
Example #10
def run_fasttext(args) -> None:
    """
    Function to run the fasttext classifier with the given command line
    arguments.

    Parameters
    ----------
    args:
        The argparse Namespace of command-line arguments produced by the
        fasttext argument parser.
    """
    if not path_exists(abs_path(args.train_file)):
        raise ValueError("Invalid input train file path given!")
    elif args.test_file and not path_exists(abs_path(args.test_file)):
        raise ValueError("Invalid input test file path given!")
    elif args.predict_test_split and not args.test_file:
        raise ValueError("Cannot predict on test_file without specifying its \
                         path!")

    train_file_name = abs_path(args.train_file)
    classifier = ft.supervised(train_file_name, args.input_binary)

    output_file = None
    if args.metrics:
        from sys import stdout
        output_file = stdout
    else:
        if not path_exists(abs_path(args.output_dir)):
            make_dirs(abs_path(args.output_dir))
        output_file = open(
            abs_path(path_join(args.output_dir, args.metrics_file)), "w"
        )

    if args.test_file:
        test_file_name = abs_path(args.test_file)
        results = classifier.test(test_file_name)
        print("Number of test slices:", results.nexamples, file=output_file)
        print("Precision:", results.precision, file=output_file)
        print("Recall:", results.recall, file=output_file)
        print(file=output_file)

    if args.predict_strip or args.predict_test_split or args.predict_test_texts:
        print("Prediction results:\n", file=output_file)

    if args.predict_strip:
        print(fancy_prediction(classifier, args.predict_strip),
              file=output_file)
    elif args.predict_test_split and args.test_file:
        with open(test_file_name) as f:
            for line in f.read().splitlines():
                print(
                    fancy_prediction(
                        classifier,
                        ' '.join(line.split()[1:])
                    ), '\n',
                    sep='',
                    file=output_file
                )
    elif args.predict_test_texts:
        with open(args.predict_test_texts) as f:
            for line in f.read().splitlines()[1:]:
                filename, label, tokens = line.split(',')
                print(filename, ':', sep='', file=output_file)
                print(fancy_prediction(classifier, tokens), '\n', sep='',
                      file=output_file
                      )
Example #11
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    word_content_data = probing_utils.read_senteval_data(
        FLAGS.senteval_path, _TASK_NAME)
    all_sentences = list(
        probing_utils.get_strings_from_sharded_recordio(FLAGS.gutenberg_path))

    train_set = word_content_data.loc[word_content_data['set'] == 'tr', :]
    dev_set = word_content_data.loc[word_content_data['set'] == 'va', :]

    experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                       f'{FLAGS.split_name}-{{}}')

    for trial_id in range(FLAGS.num_trials):
        split_dir = experiment_base_dir.format(trial_id)
        probing_dir = os.path.join(split_dir, 'probing')
        settings_path = os.path.join(probing_dir,
                                     f'{_TASK_NAME}-settings.json')
        data_out_path = os.path.join(probing_dir, _TASK_NAME)

        logging.info('Starting run: %d.', trial_id)

        sampled_sentences = random.sample(all_sentences, FLAGS.test_set_size)
        data_sample = []

        for sentence in sampled_sentences:

            # Swap bigrams of 50% of the sentences.
            if random.randint(0, 1):
                tokens = sentence.split()
                index_to_swap = random.randint(0, len(tokens) - 2)
                tmp = tokens[index_to_swap + 1]
                tokens[index_to_swap + 1] = tokens[index_to_swap]
                tokens[index_to_swap] = tmp
                sentence = ' '.join(tokens)
                target = 'I'
            else:
                target = 'O'

            data_sample.append(('te', target, sentence))

        test_set = pd.DataFrame(data_sample, columns=train_set.columns)
        new_data = pd.concat([train_set, dev_set, test_set], ignore_index=True)

        logging.info('Writing output to file: %s.', data_out_path)
        os.make_dirs(probing_dir)

        with open(settings_path, 'w') as settings_file:
            settings = {
                'task_name': _TASK_NAME,
                'trial_id': trial_id,
                'train_size': len(train_set),
                'dev_size': len(dev_set),
                'test_size': len(test_set),
            }
            logging.info('Settings:\n%r', settings)
            json.dump(settings, settings_file, indent=2)

        with open(data_out_path, 'w') as data_file:
            # Don't add quoting to retain the original format unaltered.
            new_data[['set', 'target', 'text']].to_csv(data_file,
                                                       sep='\t',
                                                       header=False,
                                                       index=False,
                                                       quoting=csv.QUOTE_NONE,
                                                       doublequote=False)
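The positive examples above are created by exchanging one randomly chosen pair of adjacent tokens. The same operation as a small hypothetical helper (it assumes the sentence has at least two tokens):

import random

def swap_random_bigram(sentence):
    # Pick an index i and swap tokens i and i+1, as in the loop above.
    tokens = sentence.split()
    i = random.randint(0, len(tokens) - 2)
    tokens[i], tokens[i + 1] = tokens[i + 1], tokens[i]
    return ' '.join(tokens)

print(swap_random_bigram('the quick brown fox'))  # e.g. 'the brown quick fox'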
Example #12
#!/usr/bin/env python
# coding: utf-8

# In[12]:
import os
get_ipython().run_line_magic('run', 'embeddings.ipynb')
get_ipython().run_line_magic('run', 'dataset.ipynb')
get_ipython().run_line_magic('run', 'label_generator.ipynb')


# **Parse raw data into different folders**

# In[23]:
if not os.path.exists('../data/parsed_data/cnn/'): os.make_dirs('../data/parsed_data/cnn/')
if not os.path.exists('../data/parsed_data/dailymail'): os.make_dirs('../data/parsed_data/dailymail')

parse_all('../data/neuralsum/cnn/', '../data/parsed_data/cnn/')
parse_all('../data/neuralsum/dailymail/', '../data/parsed_data/dailymail/')


# **Convert parsed training documents to embedding indices**

# In[24]:
if not os.path.exists('../data/processed/cnn/'): os.make_dirs('../data/processed/cnn/')
if not os.path.exists('../data/processed/dailymail'): os.make_dirs('../data/processed/dailymail')

data_converter = NeuralSumToEmbedding('/home/ramkishore.s/meta/glove/glove.6B.200d.txt')
data_converter.root_convert('../data/parsed_data/cnn/', '../data/processed/cnn/')
data_converter.root_convert('../data/parsed_data/dailymail/', '../data/processed/dailymail/')

Example #13
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)
        df['text_len'] = df['text'].str.split().apply(len)
        experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                           FLAGS.split_name) + '-{}'

        for trial_id in range(FLAGS.trial_count):
            split_dir = experiment_base_dir.format(trial_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))
            logging.info('Starting run: %d.', trial_id)

            test_lengths, test_mask = probing_utils.split_by_random_length(
                df, FLAGS.test_set_size)
            train_dev_indexes = np.nonzero(~test_mask)[0]
            dev_indexes = random.sample(train_dev_indexes.tolist(),
                                        FLAGS.dev_set_size)
            train_indexes = list(set(train_dev_indexes) - set(dev_indexes))

            logging.info('Writing output to file: %s.', data_out_path)

            # Set new labels.
            df.loc[test_mask, 'set'] = 'te'
            df.loc[df.index[train_indexes], 'set'] = 'tr'
            df.loc[df.index[dev_indexes], 'set'] = 'va'

            os.make_dirs(probing_dir)

            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'trial_id': trial_id,
                    'all_lengths': sorted(set(df['text_len'])),
                    'test_lengths': sorted(test_lengths),
                    'train/dev_lengths': sorted(
                        set(df['text_len']) - set(test_lengths)),
                    'train_size': len(train_indexes),
                    'dev_size': len(dev_indexes),
                    'test_size': int(test_mask.sum()),
                    'test_mask': test_mask.values.tolist(),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                df[['set', 'target', 'text']].to_csv(data_file,
                                                     sep='\t',
                                                     header=False,
                                                     index=False,
                                                     quoting=csv.QUOTE_NONE,
                                                     doublequote=False)
Example #14
    print("\tgunzipping", fastqfile)
    print( run_timed( [ "gunzip", "-q", fastqfile] ) )

print("\n>>> GUNZIP: unzipping _R2.fastq.gz files:")
for fastqfile in samples2:
    print("\tgunzipping", fastqfile)
    print( run_timed( [ "gunzip", "-q", fastqfile] ) )

# strip the .gz here?
unzipped1 = [f.replace('.gz', '') for f in samples1]
unzipped2 = [f.replace('.gz', '') for f in samples2]

# TRIMMOMATIC to remove unwanted sequences
# FASTQC to determine quality
print("\n>>> TRIMMOMATIC: Trimming excess sequences from .fastq file")
os.make_dirs("%s01_trimmomatic" % outputdir, exit_ok=True)

# FASTQC to determine quality
print("\n>>> FASTQC: analyzing quality of each .fastq file")
os.make_dirs( os.path.join(outputdir , "02_fastqc" ), exist_ok=True)

for samplename, unzippedfile1, unzippedfile2 in zip(names, unzipped1, unzipped2):

    #Make output directories
    sample_1_outputdir = os.path.join( outputdir , "02_fastqc/" , samplename + "_1" )
    sample_2_outputdir = os.path.join( outputdir , "02_fastqc/" , samplename + "_2" )

    inputfile = os.path.join(inputdir, unzippedfile1)

    # R1
    cmd1_args = [ 'fastqc', '-o', sample_1_outputdir, "-t 20", inputfile ]
Example #15
cc = "g++"
#scheme_src = """
#zengine/scheme/bignum.c
#zengine/scheme/eval.c
#zengine/scheme/gc.c
#zengine/scheme/opcodes.c
#zengine/scheme/sexp.c
#zengine/scheme/vm.c
#zengine/scheme/include/chibi/sexp-hufftabs.c
#"""
scheme_src = ""

############################
#create tempdir
if not os.path.exists(temp_dir):
    os.make_dirs(temp_dir)

#########################################
#split line
cpp_files = []
for one_dir in dirs:
    cpp_files = cpp_files + [
        one_dir + "/" + cpp for cpp in os.listdir(one_dir)
        if cpp.endswith('.cpp') or cpp.endswith('.c') or cpp.endswith('.cc')
    ]

cpp_files = cpp_files + scheme_src.split()

get_temp_o = lambda cpp: temp_dir + "/" + cpp.replace("./", "").replace(
    ".cpp", ".o").replace(".cc", ".o").replace(".c", ".o").replace("/", ".")
"""
cosmic_gene_list
cosmic_raw_data_folder - remember to add slash at the end
"""

cosmic_gene_list = sys.argv[1]
base_folder = global_stuff.base_folder
cosmic_raw_data_folder = sys.argv[2]

f = open(cosmic_gene_list, 'r')

for line in f:
    gene = line.strip()
    print gene
    folder = base_folder + gene + '/'
    try:
        os.makedirs(folder)
    except Exception, err:
        print err
    # find the sequence in cosmic downloaded seq folder
    seq_folder = cosmic_raw_data_folder + gene[0].upper() + '/'
    try:
        os.make_dirs(seq_folder)
    except Exception, err:
        pass
    seq_file = seq_folder + gene + '_protein.txt'
    new_seq_file = folder + 'seq'
    subprocess.call(['cp', seq_file, new_seq_file])

f.close()
Example #17
# Python program to convert
# text file to pdf file

from fpdf import FPDF
import os
"""import glob

# file name in array
file_path = glob.glob("./code_file/*")
"""
# save FPDF() class into
# a variable pdf
file_path = []
if not os.path.exists('code_file'):
    os.make_dirs('code_file')
for dir in os.listdir('code_file'):
    file = './code_file/' + dir
    file_path.append(file)
pdf = FPDF()
#print(file_path)
# Add a page
pdf.add_page()

# set style and size of font
# that you want in the pdf
pdf.set_font("Arial", size=15)

# open the text file in read mode
for f in file_path:
    f = open(f, "r")
Example #18
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    word_content_data = probing_utils.read_senteval_data(
        FLAGS.senteval_path, _TASK_NAME)
    target_words = set(word_content_data['target'])
    senteval_sentences = set(word_content_data['text'])
    target_word_to_sentences = probing_utils.get_target_word_to_sentence_mapping(
        target_words, senteval_sentences,
        probing_utils.get_strings_from_sharded_recordio(FLAGS.gutenberg_path))
    class_count = len(target_words)
    items_per_class = FLAGS.test_set_size // class_count
    target_word_counts = {
        k: len(v)
        for k, v in target_word_to_sentences.items()
    }
    valid_target_words = {
        k
        for k, v in target_word_counts.items() if v >= items_per_class
    }
    logging.info('Number of target words for which we have enough data: %d.',
                 len(valid_target_words))
    assert valid_target_words

    train_set = word_content_data.loc[word_content_data['set'] == 'tr', :]
    dev_set = word_content_data.loc[word_content_data['set'] == 'va', :]

    experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                       f'{FLAGS.split_name}-{{}}')

    for trial_id in range(FLAGS.num_trials):
        split_dir = experiment_base_dir.format(trial_id)
        probing_dir = os.path.join(split_dir, 'probing')
        settings_path = os.path.join(probing_dir,
                                     f'{_TASK_NAME}-settings.json')
        data_out_path = os.path.join(probing_dir, _TASK_NAME)

        logging.info('Starting run: %d.', trial_id)

        data_sample = []

        for valid_target_word in valid_target_words:
            sentences = target_word_to_sentences[valid_target_word]
            current_sample = random.sample(sentences, items_per_class)
            data_sample.extend(
                ('te', valid_target_word, sample) for sample in current_sample)

        test_set = pd.DataFrame(data_sample, columns=train_set.columns)
        new_data = pd.concat([train_set, dev_set, test_set], ignore_index=True)

        logging.info('Writing output to file: %s.', data_out_path)
        os.make_dirs(probing_dir)

        with open(settings_path, 'w') as settings_file:
            settings = {
                'task_name': _TASK_NAME,
                'trial_id': trial_id,
                'train_size': len(train_set),
                'dev_size': len(dev_set),
                'test_size': len(test_set),
                'valid_target_words_size': len(valid_target_words),
                'valid_target_words': sorted(valid_target_words),
            }
            logging.info('Settings:\n%r', settings)
            json.dump(settings, settings_file, indent=2)

        with open(data_out_path, 'w') as data_file:
            # Don't add quoting to retain the original format unaltered.
            new_data[['set', 'target', 'text']].to_csv(data_file,
                                                       sep='\t',
                                                       header=False,
                                                       index=False,
                                                       quoting=csv.QUOTE_NONE,
                                                       doublequote=False)
Example #19
def concatenate(enc_dir1, enc_dir2, output_dir):
    def _check_consistency(enc_dir1, enc_dir2):
        mp1 = _get_model_name(enc_dir1)
        mp2 = _get_model_name(enc_dir2)
        if mp1 != mp2:
            raise RuntimeError("Inconsistent model names, %s and %s" %
                               (mp1, mp2))
        et1 = _get_enc_transform(enc_dir1)
        et2 = _get_enc_transform(enc_dir2)
        if et1 != et2:
            raise RuntimeError("Inconsistent encoding transforms, %s and %s" \
                    % (et1, et2))

    def _check_paths(enc_dir):
        paths = [
            pj(enc_dir, "model_path.txt"),
            pj(enc_dir, "encoding_chip_list.txt"),
            pj(enc_dir, "encoding_transform.txt"),
            pj(enc_dir, "encoding.npy")
        ]
        for p in paths:
            if not pe(p):
                raise RuntimeError("Expecting to find path %s but it does not "\
                        "exist" % (p))

    def _get_enc_transform(enc_dir):
        with open(pj(enc_dir, "encoding_transform.txt")) as fp:
            enc_tr = next(fp).strip()
        return enc_tr

    def _get_files(enc_dir):
        files = []
        with open(pj(enc_dir, "encoding_chip_list.txt")) as fp:
            for line in fp:
                files.append(line.strip())
        return files

    def _get_model_name(enc_dir):
        with open(pj(enc_dir, "model_path.txt")) as fp:
            mp = next(fp).strip()
            while mp.endswith("/"):
                mp = mp[:-1]
            name = os.path.basename(mp)
        return name

    def _write_ancillary(enc_dir1, enc_dir2, output_dir):
        shutil.copy(pj(enc_dir1, "encoding_transform.txt"), output_dir)
        shutil.copy(pj(enc_dir1, "model_path.txt"), output_dir)
        with open(pj(output_dir, "original_encodings.txt"), "w") as fp:
            fp.write(enc_dir1 + "\n")
            fp.write(enc_dir2 + "\n")

    def _write_chip_list(enc_dir1, enc_dir2, output_dir):
        files1 = _get_files(enc_dir1)
        files2 = _get_files(enc_dir2)
        files = files1 + files2
        with open(pj(output_dir, "encoding_chip_list.txt"), "w") as fp:
            for f in files:
                fp.write(f + "\n")

    def _write_encs(enc_dir1, enc_dir2, output_dir):
        encs1 = np.load(pj(enc_dir1, "encoding.npy"))
        encs2 = np.load(pj(enc_dir2, "encoding.npy"))
        encs = np.concatenate((encs1, encs2))
        np.save(pj(output_dir, "encoding.npy"), encs)

    _check_paths(enc_dir1)
    _check_paths(enc_dir2)
    _check_consistency(enc_dir1, enc_dir2)
    if not pe(output_dir):
        os.make_dirs(output_dir)
    _write_encs(enc_dir1, enc_dir2, output_dir)
    _write_chip_list(enc_dir1, enc_dir2, output_dir)
    _write_ancillary(enc_dir1, enc_dir2, output_dir)
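Here pj and pe are presumably shorthand for os.path.join and os.path.exists (an assumption; they are defined outside this snippet). A hypothetical invocation, with made-up paths:

# Merge two encoding directories that were produced with the same model and
# the same encoding transform into a third directory.
concatenate("runs/enc_part1", "runs/enc_part2", "runs/enc_combined")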