def find_start_line_lda_predictions(predictions_file, num_topics):
    """
    Return the line number (zero indexed) of the start of the last set of
    predictions in predictions_file.

    Parameters
    ----------
    predictions_file : filepath or buffer
        The -p output of a VW lda run
    num_topics : Integer
        The number of topics you should see

    Returns
    -------
    start_line : Integer
        Zero-indexed number of the last line whose doc_id (last column)
        equals the doc_id found on the first line.

    Raises
    ------
    AssertionError
        If a line does not split into exactly num_topics + 1 fields.
    ValueError
        If predictions_file contains no lines at all.

    Notes
    -----
    The predictions_file contains repeated predictions...one for every pass.
    We keep only the last set by looking for repeats of the first line's
    doc_id field.  We thus, at this time, require the VW formatted file to
    have, in the last column, a unique doc_id associated with the doc.
    """
    start_line = None
    first_doc_id = None

    with smart_open(predictions_file) as open_file:
        for line_num, line in enumerate(open_file):
            split_line = line.split()
            # Currently only deal with topics + a doc_id
            assert len(split_line) == num_topics + 1, "Is num_topics correct?"
            doc_id = split_line[-1]
            if line_num == 0:
                first_doc_id = doc_id
            # Every time the first doc_id shows up again a new pass begins
            if doc_id == first_doc_id:
                start_line = line_num

    if start_line is None:
        # Without this guard an empty file surfaced as UnboundLocalError
        raise ValueError(
            "predictions_file contains no lines; cannot locate predictions")

    return start_line
def main():
    """
    Command-line entry point of the Salento Graphviz trace visualizer.

    Parses the command line, loads the JSON dataset named by ``filename``
    and then either lists the first tokens (``--list-first``), lists the
    last tokens (``--list-last``), or builds the trace graph and emits it
    to standard output or to ``--outfile``.
    """
    # Assembled from adjacent literals; the resulting text is unchanged.
    description = (
        "Salento Graphviz trace vizualizer. The input (a Salento JSON "
        "dataset) can be filtered by its terms (the call name \n and "
        "location joined with a colon `:`); the switches are combined with "
        "a logical AND, thu prunning more the search. You can use `^` and "
        "`$` to represent the beginning and the end of a term, "
        "respectively. "
        "Example: `oo()` matches any call name OR any location that "
        "contains `oo()`. "
        "Example: `^foo` matches any call name that starts with `foo`. "
        "Example: `:30$` matches any location that ends with `:30`. "
        "Example: `^foo():file.c:30` matches any term that starts with a "
        "call name `foo()` and a location `file.c:30`. "
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('filename', help='input data file')
    parser.add_argument(
        '--match', '-m',
        help='Filter in sequences that contain the given location.')
    parser.add_argument(
        '--end', '-e',
        help='Filter in sequences that end with the given location.')
    parser.add_argument(
        '--start', '-s',
        help='Filter in sequences that start with the given location.')
    parser.add_argument(
        '--list-first', action='store_true',
        help="List the first tokens of the dataset.")
    parser.add_argument(
        '--list-last', action='store_true',
        help="List the last tokens of the dataset.")
    parser.add_argument(
        '--outfile', '-o', default=sys.stdout,
        help="Save the Graphviz file. Default: standard output.")
    options = parser.parse_args()

    with common.smart_open(options.filename) as dataset_file:
        dataset = json.load(dataset_file)

    # Listing modes take precedence over rendering; --list-first wins.
    if options.list_first:
        show_nth(dataset, 0)
        return
    if options.list_last:
        show_nth(dataset, -1)
        return

    graph = salento_to_trace(options, dataset)
    # The default of --outfile is the sys.stdout object itself, so an
    # identity test tells "not given" apart from a user-supplied path.
    if options.outfile is sys.stdout:
        print(graph.source)
    else:
        graph.save(filename=options.outfile)
def load(cls, loadfile):
    """
    Unpickle a previously saved SFileFilter from disk.

    Parameters
    ----------
    loadfile : filepath or buffer
        Opened in binary read mode.

    Returns
    -------
    The object stored in loadfile, exactly as cPickle restores it.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with smart_open(loadfile, 'rb') as pickled:
        restored = cPickle.load(pickled)
    return restored
def load(cls, loadfile):
    """
    Restore a pickled object from disk.

    Parameters
    ----------
    loadfile : filepath or buffer
        Opened in binary read mode.

    Returns
    -------
    The object stored in loadfile, exactly as cPickle restores it.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with smart_open(loadfile, 'rb') as pickled:
        restored = cPickle.load(pickled)
    return restored
def write_packages(filename, pkgs):
    """
    Stream pkgs to filename as the JSON object ``{"packages": [...]}``.

    Each element of pkgs is serialized on its own with ``json.dump``, so
    pkgs may be any iterable and is never materialized as a whole.
    """
    with common.smart_open(filename, "wt") as out:
        out.write('{"packages": [')
        for position, package in enumerate(pkgs):
            # A separator goes before every element except the first
            if position > 0:
                out.write(',')
            json.dump(package, out)
        out.write(']}')
def filter_sfile(
        self, infile, outfile, doc_id_list=None, enforce_all_doc_id=True):
    """
    Rewrite an sfile, replacing tokens by their id values and dropping
    tokens that are not in self.token2id.  Optionally filters on doc_id.

    Parameters
    ----------
    infile : file path or buffer
        The sfile to read; each line represents one document.
    outfile : file path or buffer
        Receives one converted line per document that passes the filter.
    doc_id_list : Iterable over strings
        Keep only rows with doc_id in this list
    enforce_all_doc_id : Boolean
        Passed on to self._done_check once the whole file has been
        written.  Per the original documentation: if True (and
        doc_id_list is not None), raise an exception unless every doc_id
        in doc_id_list was seen.
    """
    assert self.sfile_loaded, "Must load an sfile before you can filter"

    if not hasattr(self, 'id2token'):
        self._print(
            "WARNING: Filtering an sfile before setting self.id2token.  "
            "The resultant outfile will have collisions and you will not "
            "be able to convert ids back to tokens.\nIt is recommended to "
            "call: self.compactify() then either self.set_id2token() or "
            " self.save() before filtering")

    keep_record = self._get_extra_filter(doc_id_list)

    with smart_open(infile) as source, smart_open(outfile, 'w') as sink:
        for line in source:
            record = self.formatter.sstr_to_dict(line)
            if not keep_record(record):
                continue

            # Re-key the surviving features by id; unknown tokens vanish
            id_values = {}
            for token, value in record['feature_values'].iteritems():
                if token in self.token2id:
                    id_values[self.token2id[token]] = value
            record['feature_values'] = id_values

            sink.write(self.formatter.get_sstr(**record) + '\n')

    self._done_check(enforce_all_doc_id)
def save(self, savefile, protocol=-1):
    """
    Pickle self to savefile.

    Parameters
    ----------
    savefile : filepath or buffer
        Opened in binary write mode.
    protocol : 0, 1, 2, -1
        0 < 1 < 2 in terms of performance.  -1 means use highest available.
    """
    # Pickle data is binary for every protocol above 0 (and always bytes
    # on Python 3), and the matching load() reads with 'rb'.  Text mode
    # 'w' can corrupt the stream through newline translation, so write
    # with 'wb'.
    with smart_open(savefile, 'wb') as f:
        cPickle.dump(self, f, protocol=protocol)
def parse_lda_topics(topics_file, num_topics, normalize=True):
    """
    Returns a DataFrame representation of the topics output of an lda VW run.

    Parameters
    ----------
    topics_file : filepath or buffer
        The --readable_model output of a VW lda run
    num_topics : Integer
        The number of topics in every valid row
    normalize : Boolean
        Normalize the rows so that they represent probabilities of topic
        given hash_val

    Returns
    -------
    topics : DataFrame
        Indexed by hash_val, with one column 'topic_<i>' per topic.

    Raises
    ------
    ValueError, IndexError, AssertionError
        If a malformed row shows up after the first valid row was read.

    Notes
    -----
    The trick is dealing with lack of a marker for the information printed
    on top, and the inconsistent delimiter choice.
    """
    topics = {'topic_%d' % i: [] for i in range(num_topics)}
    topics['hash_val'] = []

    # The topics file contains a bunch of informational printout stuff at
    # the top.  Rows that fail to parse are skipped until the first valid
    # row is seen; after that, a failure is a real error.
    with smart_open(topics_file, 'r') as open_file:
        in_valid_rows = False
        for line in open_file:
            try:
                # Sometimes trailing space...that's the reason for split()
                # rather than csv.reader or a direct pandas read.
                split_line = line.split()
                hash_val = int(split_line[0])
                topic_weights = [float(item) for item in split_line[1:]]
                # Explicit check instead of `assert`, so the validation is
                # not stripped when Python runs with -O.  AssertionError is
                # kept so callers observe the same exception type.
                if len(topic_weights) != num_topics:
                    raise AssertionError(
                        "Expected %d topic weights, found %d"
                        % (num_topics, len(topic_weights)))
            except (ValueError, IndexError, AssertionError):
                if in_valid_rows:
                    raise
                continue

            for i, weight in enumerate(topic_weights):
                topics['topic_%d' % i].append(weight)
            topics['hash_val'].append(hash_val)
            in_valid_rows = True

    topics = pd.DataFrame(topics).set_index('hash_val')
    if normalize:
        topics = topics.div(topics.sum(axis=1), axis=0)

    return topics
def convert_to_json(in_fname, out_fname, enclose_in_packages, trans):
    """
    Translate in_fname and stream the result to out_fname as JSON.

    The output is the object ``{"data":[...],"name":<in_fname>}``, whose
    ``data`` entries are the sequences yielded by
    ``translate_file(in_fname, trans)``.  When enclose_in_packages is true
    that object is additionally wrapped as ``{"packages":[ ... ]}``.
    """
    with common.smart_open(out_fname, 'wt') as sink:
        if enclose_in_packages:
            sink.write('{"packages":[')
        sink.write('{"data":[')

        for position, sequence in enumerate(translate_file(in_fname, trans)):
            # A separator goes before every sequence except the first
            if position > 0:
                sink.write(',')
            json.dump(sequence, sink)

        sink.write('],"name":')
        json.dump(in_fname, sink)
        sink.write("}")
        if enclose_in_packages:
            sink.write(']}')
def sfile_to_token_iter(self, filepath_or_buffer, limit=None):
    """
    Return an iterator over filepath_or_buffer that returns, line-by-line,
    a token_list.

    Parameters
    ----------
    filepath_or_buffer : string or file handle / StringIO.
        File should be formatted according to self.format.
    limit : Integer, optional
        Stop after yielding this many lines.  None (default) means read
        the whole file.

    Returns
    -------
    token_iter : Iterator
        E.g. next(token_iter) gets the next line as a list of tokens.
    """
    with smart_open(filepath_or_buffer) as open_file:
        for index, line in enumerate(open_file):
            if index == limit:
                # End the generator with a plain return.  Raising
                # StopIteration inside a generator body is turned into a
                # RuntimeError under PEP 479 (Python 3.7+).
                return
            yield self.sstr_to_token_list(line)
def parse_lda_predictions(
        predictions_file, num_topics, start_line, normalize=True):
    """
    Build a DataFrame out of a VW prediction file.

    Parameters
    ----------
    predictions_file : filepath or buffer
        The -p output of a VW lda run
    num_topics : Integer
        The number of topics you should see
    start_line : Integer
        Lines before this (zero-indexed) line are skipped.  The predictions
        file contains repeated predictions, one for every pass, and you
        generally do not want every prediction.
    normalize : Boolean
        Normalize the rows so that they represent probabilities of topic
        given doc_id.

    Returns
    -------
    predictions : DataFrame
        Indexed by doc_id, with one column 'topic_<i>' per topic.
    """
    columns = {'topic_%d' % i: [] for i in range(num_topics)}
    columns['doc_id'] = []

    with smart_open(predictions_file) as open_file:
        for line_num, line in enumerate(open_file):
            if line_num < start_line:
                continue
            # The first num_topics fields are weights; anything after
            # them is collected as a doc_id.
            for position, field in enumerate(line.split()):
                if position >= num_topics:
                    columns['doc_id'].append(field)
                else:
                    columns['topic_%d' % position].append(float(field))

    frame = pd.DataFrame(columns).set_index('doc_id')
    if normalize:
        row_totals = frame.sum(axis=1)
        frame = frame.div(row_totals, axis=0)

    return frame
def parse_varinfo(varinfo_file):
    """
    Uses the output of the vw-varinfo utility to get a DataFrame with
    variable info.

    Parameters
    ----------
    varinfo_file : Path or buffer
        The output of vw-varinfo

    Returns
    -------
    varinfo : DataFrame
        Indexed by hash_val, with columns renamed to feature_name,
        max_val, min_val, rel_score and weight.
    """
    with smart_open(varinfo_file) as open_file:
        # For some reason, pandas is confused...so just split the lines
        # Create a dict {item1: [...], item2: [...],...} for each item in
        # the header.
        # The next() builtin works for Python 2 and Python 3 iterators
        # alike; the .next() method exists only on Python 2 objects.
        header = next(open_file).split()
        rows = {col_name: [] for col_name in header}
        for line in open_file:
            for i, item in enumerate(line.split()):
                rows[header[i]].append(item)

    # Create a data frame
    varinfo = pd.DataFrame(rows)

    # Format columns correctly
    # NOTE(review): '^' is presumably meant as a literal character here;
    # how str.replace treats it depends on the pandas version's regex
    # default -- confirm against the pandas version in use.
    varinfo.FeatureName = varinfo.FeatureName.str.replace('^', '')
    varinfo.HashVal = varinfo.HashVal.astype(int)
    varinfo.MaxVal = varinfo.MaxVal.astype(float)
    varinfo.MinVal = varinfo.MinVal.astype(float)
    # RelScore is printed as a percentage; convert it to a fraction
    varinfo.RelScore = (
        varinfo.RelScore.str.replace('%', '').astype(float) / 100)
    varinfo.Weight = varinfo.Weight.astype(float)

    # Rename columns to decent Python names
    varinfo = varinfo.rename(
        columns={'FeatureName': 'feature_name', 'HashVal': 'hash_val',
                 'MaxVal': 'max_val', 'MinVal': 'min_val',
                 'RelScore': 'rel_score', 'Weight': 'weight'}
        ).set_index('hash_val')

    return varinfo
def _load_sfile_fwd(self, sfile):
    """
    Build the "forward" objects involved in loading an sfile.

    Returns
    -------
    token2id : dict
        Maps each token to the value of the hash function obtained from
        self._get_hash_fun().
    token_score : defaultdict(float)
        Sum of the feature values of each token over all documents.
    doc_freq : defaultdict(int)
        For each token, the number of documents it appears in.
    num_docs : Integer
        Number of lines (documents) read from sfile.
    """
    hash_fun = self._get_hash_fun()
    token2id = {}
    token_score = defaultdict(float)
    doc_freq = defaultdict(int)
    num_docs = 0

    with smart_open(sfile) as open_file:
        # One document per line
        for line in open_file:
            num_docs += 1
            feature_values = self.formatter.sstr_to_dict(
                line)['feature_values']
            for token, value in feature_values.iteritems():
                token2id[token] = hash_fun(token)
                token_score[token] += value
                doc_freq[token] += 1

    return token2id, token_score, doc_freq, num_docs
def to_vw(self, outfile, n_jobs=1, chunksize=1000):
    """
    Write our filestream to a VW (Vowpal Wabbit) formatted file.

    Parameters
    ----------
    outfile : filepath or buffer
        Receives one formatted string per line.
    n_jobs : Integer
        Number of jobs handed to imap_easy.  Per the original
        documentation: set = 4 for 4 jobs, -1 to use all available,
        -2 for all except 1,...
    chunksize : Integer
        Number of paths grouped into a single unit of work.  Per the
        original documentation: too low and communication overhead
        dominates; too high and jobs are not distributed evenly.
    """
    # Note:  This is similar to declass/cmd/files_to_vw.py
    # A streamer specifies how doc_id is extracted from a stream, so the
    # streamer itself must be used to stream the files.  Hence paths are
    # chunked into groups and every group is handed to _group_to_sstr
    # together with self.
    grouped_paths = common.grouper(self.paths, chunksize)

    vw_formatter = text_processors.VWFormatter()
    convert_group = partial(_group_to_sstr, self, vw_formatter)

    # Each group already holds many paths, so imap_easy's own chunksize
    # argument is set to 1.
    converted_groups = imap_easy(convert_group, grouped_paths, n_jobs, 1)

    with smart_open(outfile, 'w') as sink:
        for sstr_group in converted_groups:
            for sstr in sstr_group:
                sink.write(sstr + '\n')
def main():
    """
    Command-line entry point: filter a JSON dataset and write it back out.

    Loads ``infile``, wraps it in ``sal.Dataset`` and applies, in order:
    alias translation (``--alias-file``), vocabulary filtering
    (``--vocabs-file``), stop-word removal (``--stop-words-file``), the
    minimum sequence length filter (``--min-len``) and, unless
    ``--skip-filter-low`` is given, the TF/IDF based vocabulary filter.
    The loaded JSON data is then dumped to ``outfile`` or to standard
    output.  A KeyboardInterrupt exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='The input JSON file.')
    parser.add_argument('outfile',
                        nargs='?',
                        default=None,
                        help='The output file. Default: standard-output')
    parser.add_argument(
        '--min-len',
        default=3,
        type=int,
        help='The minimum call-sequence length accepted. Default: %(default)r')
    parser.add_argument(
        '--idf-treshold',
        default=.25,
        type=float,
        help=
        'A percentage. Any call whose IDF is below this value will be ignored. Default: %(default).2f%%'
    )
    parser.add_argument(
        '--stop-words-file',
        help=
        'Provide a file (one term per line) with terms that must be removed from any sequence. Practically, this step removes terms from the vocabulary.'
    )
    parser.add_argument(
        '--alias-file',
        help=
        'Provide a YAML file with the alias replacing each term that matches a key per value.'
    )
    parser.add_argument('--skip-filter-low',
                        dest="run_tf",
                        action="store_false",
                        help='Disables the low-frequency filter.')
    parser.add_argument(
        '--vocabs-file',
        help=
        'Disables the low-frequency filter. Uses the supplied vocabolary file, filtering any term that is not in the vocabulary.'
    )
    get_nprocs = common.parser_add_parallelism(parser)
    args = parser.parse_args()

    try:
        if args.vocabs_file is not None:
            vocabs = set(parse_word_list(args.vocabs_file))
        else:
            vocabs = None

        if args.alias_file is not None:
            import yaml
            # safe_load builds plain data only (no arbitrary Python
            # objects) and needs no Loader argument; the `with` block
            # closes the file handle, which the bare open() call leaked.
            with open(args.alias_file) as alias_fp:
                alias = yaml.safe_load(alias_fp)
        else:
            alias = None

        if args.stop_words_file is not None:
            stopwords = set(parse_word_list(args.stop_words_file))
        else:
            stopwords = None

        with common.smart_open(args.infile, 'rt') as f:
            data = json.load(f)

        ds = sal.Dataset(js=data)

        if alias is not None and len(alias) > 0:
            ds.translate_calls(alias)

        if vocabs is not None and len(vocabs) > 0:
            ds.filter_vocabs(vocabs)

        if stopwords is not None and len(stopwords) > 0:
            ds.filter_stopwords(stopwords)

        ds.filter_sequences(min_length=args.min_len)

        if args.run_tf:
            # Additionally run the TF/IDF filter
            tf = get_term_frequency(data,
                                    nprocs=get_nprocs(args),
                                    min_seq_len=args.min_len)
            # --idf-treshold is given as a percentage; convert to a fraction
            vocabs = get_common_vocabs(tf,
                                       idf_treshold=(args.idf_treshold / 100))
            ds.filter_vocabs(vocabs)

        # NOTE(review): `data` (not `ds`) is dumped, so the filters above
        # presumably modify `data` in place through the Dataset wrapper --
        # confirm against sal.Dataset.
        if args.outfile is None:
            json.dump(data, sys.stdout)
        else:
            with common.smart_open(args.outfile, 'wt') as f:
                json.dump(data, f)

    except KeyboardInterrupt:
        sys.exit(1)
def main():
    """
    Turn per-segment speaker labels into an RTTM file.

    Reads ``args.labels`` (lines of ``<segment> <label>``) and
    ``args.segments`` (lines of ``<segment> <recording> <start> <end>``),
    trims overlapping neighbouring segments at the midpoint of the overlap,
    merges touching neighbours that carry the same label, and writes one
    ``SPEAKER`` line per resulting segment to ``args.rttm_file``.

    Raises
    ------
    RuntimeError
        If a segment in the segments file has no entry in the labels file.
    """
    args = get_args()

    # File containing speaker labels per segment
    seg2label = {}
    with common_lib.smart_open(args.labels) as labels_file:
        for line in labels_file:
            seg, label = line.strip().split()
            seg2label[seg] = label

    # Segments file.  Each recording maps to one string of the form
    # "<reco> <start>,<end>,<label> <start>,<end>,<label> ..."
    reco2segs = {}
    with common_lib.smart_open(args.segments) as segments_file:
        for line in segments_file:
            seg, reco, start, end = line.strip().split()
            # Only the label lookup can be missing, so only it is guarded
            try:
                label = seg2label[seg]
            except KeyError:
                raise RuntimeError(
                    "Missing label for segment {0}".format(seg))
            reco2segs[reco] = (
                reco2segs.get(reco, reco) + " " + start + "," + end + ","
                + label)

    # Cut up overlapping segments so they are contiguous
    contiguous_segs = []
    for reco in reco2segs:
        segs = reco2segs[reco].strip().split()
        pieces = [reco]
        # segs[0] is the recording id; the last segment has no successor
        for i in range(1, len(segs) - 1):
            start, end, label = segs[i].split(',')
            next_start, next_end, next_label = segs[i + 1].split(',')
            if float(end) > float(next_start):
                # Split the overlap at its midpoint: this segment ends
                # there and the next one is rewritten to begin there.
                avg = str((float(next_start) + float(end)) / 2.0)
                segs[i + 1] = ','.join([avg, next_end, next_label])
                pieces.append(start + "," + avg + "," + label)
            else:
                pieces.append(start + "," + end + "," + label)
        start, end, label = segs[-1].split(',')
        pieces.append(start + "," + end + "," + label)
        contiguous_segs.append(" ".join(pieces))

    # Merge contiguous segments of the same label
    merged_segs = []
    for reco_line in contiguous_segs:
        segs = reco_line.strip().split()
        pieces = [segs[0]]
        for i in range(1, len(segs) - 1):
            start, end, label = segs[i].split(',')
            next_start, next_end, next_label = segs[i + 1].split(',')
            if float(end) == float(next_start) and label == next_label:
                # Fold this segment into the next one instead of emitting it
                segs[i + 1] = ','.join([start, next_end, next_label])
            else:
                pieces.append(start + "," + end + "," + label)
        start, end, label = segs[-1].split(',')
        pieces.append(start + "," + end + "," + label)
        merged_segs.append(" ".join(pieces))

    with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
        for reco_line in merged_segs:
            segs = reco_line.strip().split()
            reco = segs[0]
            for seg_entry in segs[1:]:
                start, end, label = seg_entry.strip().split(',')
                # Fields after the channel are onset and duration
                print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>".format(
                    reco, float(start), float(end)-float(start), label),
                    file=rttm_writer)
def main():
    """
    Turn per-segment speaker labels into an RTTM file.

    Reads ``args.labels`` (lines of ``<segment> <label>``) and
    ``args.segments`` (lines of ``<segment> <recording> <start> <end>``),
    trims overlapping neighbouring segments at the midpoint of the overlap,
    merges touching neighbours that carry the same label, and writes one
    ``SPEAKER`` line per resulting segment to ``args.rttm_file``, using
    ``args.rttm_channel`` as the channel field.  Recordings are processed
    in sorted order.

    Raises
    ------
    RuntimeError
        If a segment in the segments file has no entry in the labels file.
    """
    args = get_args()

    # File containing speaker labels per segment
    seg2label = {}
    with common_lib.smart_open(args.labels) as labels_file:
        for line in labels_file:
            seg, label = line.strip().split()
            seg2label[seg] = label

    # Segments file.  Each recording maps to one string of the form
    # "<reco> <start>,<end>,<label> <start>,<end>,<label> ..."
    reco2segs = {}
    with common_lib.smart_open(args.segments) as segments_file:
        for line in segments_file:
            seg, reco, start, end = line.strip().split()
            # Only the label lookup can be missing, so only it is guarded
            try:
                label = seg2label[seg]
            except KeyError:
                raise RuntimeError(
                    "Missing label for segment {0}".format(seg))
            reco2segs[reco] = (
                reco2segs.get(reco, reco) + " " + start + "," + end + ","
                + label)

    # Cut up overlapping segments so they are contiguous
    contiguous_segs = []
    for reco in sorted(reco2segs):
        segs = reco2segs[reco].strip().split()
        pieces = [reco]
        # segs[0] is the recording id; the last segment has no successor
        for i in range(1, len(segs) - 1):
            start, end, label = segs[i].split(',')
            next_start, next_end, next_label = segs[i + 1].split(',')
            if float(end) > float(next_start):
                # Split the overlap at its midpoint: this segment ends
                # there and the next one is rewritten to begin there.
                avg = str((float(next_start) + float(end)) / 2.0)
                segs[i + 1] = ','.join([avg, next_end, next_label])
                pieces.append(start + "," + avg + "," + label)
            else:
                pieces.append(start + "," + end + "," + label)
        start, end, label = segs[-1].split(',')
        pieces.append(start + "," + end + "," + label)
        contiguous_segs.append(" ".join(pieces))

    # Merge contiguous segments of the same label
    merged_segs = []
    for reco_line in contiguous_segs:
        segs = reco_line.strip().split()
        pieces = [segs[0]]
        for i in range(1, len(segs) - 1):
            start, end, label = segs[i].split(',')
            next_start, next_end, next_label = segs[i + 1].split(',')
            if float(end) == float(next_start) and label == next_label:
                # Fold this segment into the next one instead of emitting it
                segs[i + 1] = ','.join([start, next_end, next_label])
            else:
                pieces.append(start + "," + end + "," + label)
        start, end, label = segs[-1].split(',')
        pieces.append(start + "," + end + "," + label)
        merged_segs.append(" ".join(pieces))

    with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
        for reco_line in merged_segs:
            segs = reco_line.strip().split()
            reco = segs[0]
            for seg_entry in segs[1:]:
                start, end, label = seg_entry.strip().split(',')
                # Fields after the channel are onset and duration
                print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
                    reco, args.rttm_channel, float(start),
                    float(end)-float(start), label), file=rttm_writer)
def main():
    """
    Command-line entry point: split a Salento JSON file into several files.

    Exactly one of ``--n-ways``, ``--per-package`` or ``--ratio`` selects
    the partitioning function.  Output filenames come from
    ``get_out_files``; with ``--per-package`` their number is the number of
    packages found in the input.  With ``-v`` every filename yielded by the
    partitioning function is printed.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Partition a Salento input file.")
    parser.add_argument(
        "filename", help="The JSON filename we are processing.")
    parser.add_argument(
        "outfiles",
        default=None,
        nargs="*",
        help="The output filenames. When provided these will be used rather than an format string.")
    parser.add_argument(
        "--format",
        default="{basename}-{idx}.json{compress}",
        help="Output filename template. Default: %(default)s")
    parser.add_argument("-j", action="store_true", help="Compress data.")
    parser.add_argument("-v", action="store_true", help="Print filename.")
    parser.add_argument(
        "--skip-shuffle",
        dest="shuffle",
        action="store_false",
        help="Except when partitioning by package name, we shuffle which sequences appear in each partition; with this option the sequence order is preserved.")

    modes = parser.add_mutually_exclusive_group(required=True)
    modes.add_argument(
        '--n-ways',
        type=int,
        help='Partition the dataset into a given number of files')
    modes.add_argument(
        '--per-package',
        action='store_true',
        help='Partition each package into a given file.')
    modes.add_argument(
        '--ratio',
        type=parse_ratio,
        help='Partition the dataset into 2 parts, according to the ratio given by this argument.')
    args = parser.parse_args()

    # Number of output files, when it is known before reading the input
    if args.n_ways is not None:
        count = args.n_ways
    elif args.ratio is not None:
        count = 2
    else:
        count = None

    if count is not None:
        filenames = get_out_files(args, count)

    with common.smart_open(args.filename, 'rt') as fp:
        js = json.load(fp)

    if count is None:
        # Per-package mode: one output file for each package in the input
        assert args.per_package
        filenames = get_out_files(args, len(js['packages']))

    if args.n_ways is not None:
        partition = partition_by_count
    elif args.ratio is not None:
        partition = partition_by_ratio
    else:
        partition = partition_by_package

    for written_name in partition(js, filenames, args):
        if args.v:
            print(written_name)