Exemplo n.º 1
0
def parse_args(ap):
   """Parse the command line with parser `ap` and normalize the result.

   Reports an error through `ap.error()` unless every path in `args.inputs`
   has a distinct basename. Afterwards each input path is made absolute and
   an unset (None) `sortdir` is replaced by 'tmp'. Returns the args object.
   """
   args = u.parse_args(ap)
   # Collapsing the basenames into a set drops duplicates, so a size mismatch
   # means at least two inputs share a basename.
   basenames = {os.path.basename(path) for path in args.inputs}
   if (len(basenames) != len(args.inputs)):
      ap.error('input file basenames must be unique')
   # Make every input path absolute.
   args.inputs = list(map(os.path.abspath, args.inputs))
   # Fall back to 'tmp' when sortdir is unset.
   if (args.sortdir is None):
      args.sortdir = 'tmp'
   return args
Exemplo n.º 2
0
def parse_args(ap):
    """Parse arguments via `ap`, validate the inputs, and fill in defaults.

    `ap.error()` is called when two or more entries of `args.inputs` have
    the same basename. The inputs are then rewritten as absolute paths and
    `args.sortdir` defaults to 'tmp' when it is None.

    Returns the parsed (and adjusted) args object.
    """
    args = u.parse_args(ap)

    # Duplicate basenames shrink the set below the number of inputs.
    unique_names = set(map(os.path.basename, args.inputs))
    if len(unique_names) != len(args.inputs):
        ap.error('input file basenames must be unique')

    # Absolutize the input paths.
    args.inputs = [os.path.abspath(path) for path in args.inputs]

    # Default the sort directory.
    if args.sortdir is None:
        args.sortdir = 'tmp'

    return args
Exemplo n.º 3
0
      dfs[freq].index = dfs[freq].index.to_period(freq)
      dfs[freq].rename(columns=lambda c: re.sub(r'\$norm$', '', c), inplace=True)
      # 2. Clean up any NANs. We interpolate anything in the middle and change
      # boundary NANs to zero. Note that the boundaries are fairly well
      # outside the study period, so that should have minimal effect.
      dfs[freq].interpolate(method='linear', axis=0, inplace=True)
      dfs[freq].fillna(0, inplace=True)
      # 3. Trim the DataFrames to the study period. This doesn't have any effect,
      # since we trim to each test later, but it saves memory.
      (dfs[freq], _) = dfs[freq].align(eg, axis=0, join='inner')
      assert (dfs[freq].index.equals(eg.index))
   # 4. Build a DataFrame for each disease. This duplicates some vectors, but
   # not enough to be a worry.
   vs = dict()
   for (ob, ts) in sorted(g.truth.items()):
      freq = ts.index.freq.name
      dist = args.distance
      vs[ob] = dfs[freq].select(lambda c: relevant_p(ob, c, dist), axis=1)
      l.info('  %-15s %3d articles' % (ob + ':', len(vs[ob].columns)))
   return vs


### Bootstrap ###

if (__name__ == '__main__'):
   # Script entry point: this body runs only when the file is executed
   # directly, not when it is imported.
   # Parse the command line; `ap` is a module-level name defined outside this
   # excerpt.
   args = u.parse_args(ap)
   # 'in' is a Python keyword, so the attribute can only be read through
   # getattr(); expose it under the alias 'in_' for ordinary attribute access.
   args.in_ = getattr(args, 'in')  # foo.in is a syntax error
   # NOTE(review): presumably loads the configuration named by args.config and
   # initializes logging under the tag 'expmt' -- confirm against module `u`.
   u.configure(args.config)
   u.logging_init('expmt')
   main()
Exemplo n.º 4
0
        clf_times = []
        for size in sizes:
            clf_times.append('%.5f' % time_it(clf, tweets[:size]))
        l.info(clf.__module__[-15:] + '\t' + '\t'.join(clf_times))


def read_tsvs(filenames):
    """Collect the `.text` of records read from each file in `filenames`.

    Files are opened with `tweet.Reader` and consumed in the given order.
    Reading stops early once the number of collected texts exceeds the
    module-level `args.test_size`.

    NOTE(review): the limit is tested after appending and with a strict `>`,
    so up to `args.test_size + 1` texts are returned -- confirm this is the
    intended bound.

    Returns a list of texts (empty when `filenames` is empty).
    """
    texts = []
    for path in filenames:
        for record in tweet.Reader(path):
            texts.append(record.text)
            # len(texts) is exactly the number of records seen so far.
            if len(texts) > args.test_size:
                return texts
    return texts


### Bootstrap ###

# Module start-up. Argument parsing and logging setup sit outside the
# __name__ check, so they run on import as well as on direct execution.
try:
    # Bind the module-level `args`; read_tsvs() reads args.test_size from it.
    # `ap` is defined outside this excerpt.
    args = u.parse_args(ap)
    # NOTE(review): presumably initializes logging under the tag 'clsbmk' --
    # confirm against module `u`.
    u.logging_init('clsbmk')

    # Only run the benchmark when executed as a script.
    if (__name__ == '__main__'):
        main()

# NOTE(review): presumably raised when the module is invoked only to run its
# unit tests, in which case they are registered here instead of running
# main() -- confirm against module `testable`.
except testable.Unittests_Only_Exception:
    testable.register('')