Example #1
def find_start_line_lda_predictions(predictions_file, num_topics):
    """
    Return the line number (zero indexed) of the start of the last set of
    predictions in predictions_file.

    Parameters
    ----------
    predictions_file : filepath or buffer
        The -p output of a VW lda run
    num_topics : Integer
        The number of topics you should see

    Notes
    -----
    The predictions_file contains repeated predictions, one for every pass.
    We parse out and include only the last set of predictions by looking for
    repeats of the first line's doc_id field.  We therefore require the VW
    formatted file to have, in its last column, a unique doc_id associated
    with each doc.
    """
    with smart_open(predictions_file) as open_file:
        for line_num, line in enumerate(open_file):
            split_line = line.split()
            # Currently only deal with topics + a doc_id
            assert len(split_line) == num_topics + 1, "Is num_topics correct?"
            doc_id = split_line[-1]
            if line_num == 0:
                first_doc_id = doc_id
            if doc_id == first_doc_id:
                start_line = line_num

    return start_line
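
All of these examples lean on a smart_open helper (imported variously as smart_open, common.smart_open, or common_lib.smart_open) that accepts either a path or an already-open file-like object. A minimal sketch of the path-or-buffer behavior the examples rely on, written as an assumption (the real helpers may additionally handle compression or '-' for stdin/stdout):

from contextlib import contextmanager

@contextmanager
def smart_open(filepath_or_buffer, mode='r'):
    # Already file-like: yield it unchanged and leave closing to the caller.
    if hasattr(filepath_or_buffer, 'read') or hasattr(filepath_or_buffer, 'write'):
        yield filepath_or_buffer
    else:
        # Otherwise treat the argument as a path: open it and close it on exit.
        f = open(filepath_or_buffer, mode)
        try:
            yield f
        finally:
            f.close()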
Example #2
def main():
    parser = argparse.ArgumentParser(description="""Salento Graphviz trace visualizer.
The input (a Salento JSON dataset) can be filtered by its terms (the call name
and location joined with a colon `:`); the switches are combined with
a logical AND, thus pruning the search further.

You can use `^` and `$` to represent the beginning and the end of a term,
respectively.
Example: `oo()` matches any call name OR any location that contains `oo()`.
Example: `^foo` matches any call name that starts with `foo`.
Example: `:30$` matches any location that ends with `:30`.
Example: `^foo():file.c:30` matches any term that starts with a call name
`foo()` and a location `file.c:30`.
""")
    parser.add_argument('filename', help='input data file')
    parser.add_argument('--match', '-m', help='Filter in sequences that contain the given location.')
    parser.add_argument('--end', '-e', help='Filter in sequences that end with the given location.')
    parser.add_argument('--start', '-s', help='Filter in sequences that start with the given location.')
    parser.add_argument('--list-first', action='store_true', help="List the first tokens of the dataset.")
    parser.add_argument('--list-last', action='store_true', help="List the last tokens of the dataset.")
    parser.add_argument('--outfile', '-o', default=sys.stdout, help="Save the Graphviz file. Default: standard output.")
    args = parser.parse_args()
    with common.smart_open(args.filename) as f:
        js = json.load(f)
    if args.list_first:
        show_nth(js, 0)
    elif args.list_last:
        show_nth(js, -1)
    else:
        g = salento_to_trace(args, js)
        if args.outfile is sys.stdout:
            print(g.source)
        else:
            g.save(filename=args.outfile)
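
The `^`/`$` syntax described in the help text above amounts to substring matching against the joined `call:location` term, optionally anchored at either end. A small illustrative sketch of those semantics (not the tool's actual matcher; term_matches is a hypothetical name):

import re

def term_matches(pattern, call_name, location):
    # A term is the call name and the location joined with ':'.
    term = '{0}:{1}'.format(call_name, location)
    anchor_start = pattern.startswith('^')
    anchor_end = pattern.endswith('$')
    body = pattern[1:] if anchor_start else pattern
    body = body[:-1] if anchor_end else body
    regex = ('^' if anchor_start else '') + re.escape(body) + ('$' if anchor_end else '')
    return re.search(regex, term) is not None

assert term_matches('oo()', 'foo()', 'file.c:30')   # contained anywhere in the term
assert term_matches('^foo', 'foo()', 'file.c:30')   # anchored to the start
assert term_matches(':30$', 'foo()', 'file.c:30')   # anchored to the end
assert not term_matches('^bar', 'foo()', 'file.c:30')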
Example #3
    def load(cls, loadfile):
        """
        Unpickle SFileFilter from disk.

        Parameters
        ----------
        loadfile : filepath or buffer
        """
        with smart_open(loadfile, 'rb') as f:
            return cPickle.load(f)
Example #4
    def load(cls, loadfile):
        """
        Unpickle class from disk.

        Parameters
        ----------
        loadfile : filepath or buffer
        """
        with smart_open(loadfile, 'rb') as f:
            return cPickle.load(f)
Example #5
def write_packages(filename, pkgs):
    with common.smart_open(filename, "wt") as fp:
        fp.write('{"packages": [')
        is_first = True
        for pkg in pkgs:
            if not is_first:
                fp.write(',')
            json.dump(pkg, fp)
            is_first = False
        fp.write(']}')
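
A minimal usage sketch for write_packages, assuming the function (and its common.smart_open dependency) is importable from the surrounding module; the file name is illustrative:

import json

pkgs = [{"name": "pkg_a", "data": []}, {"name": "pkg_b", "data": []}]
write_packages("packages.json", pkgs)

# The output is a single JSON object of the form {"packages": [...]}.
with open("packages.json") as fp:
    assert json.load(fp)["packages"] == pkgs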
Example #6
    def filter_sfile(
        self, infile, outfile, doc_id_list=None, enforce_all_doc_id=True):
        """
        Alter an sfile by converting tokens to id values, and removing tokens
        not in self.token2id.  Optionally filters on doc_id.

        Parameters
        ----------
        infile : file path or buffer
        outfile : file path or buffer
        doc_id_list : Iterable over strings
            Keep only rows with doc_id in this list
        enforce_all_doc_id : Boolean
            If True (and doc_id is not None), raise exception unless all doc_id
            in doc_id_list are seen.
        """
        assert self.sfile_loaded, "Must load an sfile before you can filter"
        if not hasattr(self, 'id2token'):
            self._print(
                "WARNING:  Filtering an sfile before setting self.id2token.  "
                "The resultant outfile will have collisions and you will not "
                "be able to convert ids back to tokens.\nIt is recommended to "
                "call: self.compactify() then either self.set_id2token() or "
                " self.save() before filtering")

        extra_filter = self._get_extra_filter(doc_id_list)

        with smart_open(infile) as f, smart_open(outfile, 'w') as g:
            # Each line represents one document
            for line in f:
                record_dict = self.formatter.sstr_to_dict(line)
                if extra_filter(record_dict):
                    record_dict['feature_values'] = {
                        self.token2id[token]: value 
                        for token, value
                        in record_dict['feature_values'].iteritems() 
                        if token in self.token2id}
                    new_sstr = self.formatter.get_sstr(**record_dict)
                    g.write(new_sstr + '\n')

        self._done_check(enforce_all_doc_id)
Example #7
    def save(self, savefile, protocol=-1):
        """
        Pickle self to outfile.

        Parameters
        ----------
        savefile : filepath or buffer
        protocol : 0, 1, 2, -1
            0 < 1 < 2 in terms of performance.  -1 means use highest available.
        """
        with smart_open(savefile, 'w') as f:
            cPickle.dump(self, f, protocol=protocol)
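
Taken together with the load classmethods above, save gives a simple pickle round trip. A hypothetical sketch (the SFileFilter constructor is not shown in these examples, so its arguments are assumed):

sf = SFileFilter(formatter)                       # hypothetical constructor call; arguments assumed
sf.save('sfile_filter.pkl')                       # pickles self through smart_open
restored = SFileFilter.load('sfile_filter.pkl')   # the classmethod returns the unpickled object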
Example #8
def parse_lda_topics(topics_file, num_topics, normalize=True):
    """
    Returns a DataFrame representation of the topics output of an lda VW run.

    Parameters
    ----------
    topics_file : filepath or buffer
        The --readable_model output of a VW lda run
    num_topics : Integer
        The number of topics in every valid row
    normalize : Boolean
        Normalize the rows so that they represent probabilities of topic
        given hash_val

    Notes
    -----
    The trick is dealing with the lack of a marker for the informational
    header printed at the top, and the inconsistent delimiter choice.
    """
    topics = {'topic_%d' % i: [] for i in range(num_topics)}
    topics['hash_val'] = []
    # The topics file contains a bunch of informational printout stuff at
    # the top.  Figure out what line this ends on
    with smart_open(topics_file, 'r') as open_file:
        # Once we detect that we're in the valid rows, there better not be
        # any exceptions!
        in_valid_rows = False
        for line in open_file:
            try:
                # If this row raises an exception, then it isn't a valid row
                # Sometimes trailing space...that's the reason for split()
                # rather than csv.reader or a direct pandas read.
                split_line = line.split()
                hash_val = int(split_line[0])
                topic_weights = [float(item) for item in split_line[1:]]
                assert len(topic_weights) == num_topics
                for i, weight in enumerate(topic_weights):
                    topics['topic_%d' % i].append(weight)
                topics['hash_val'].append(hash_val)
                in_valid_rows = True
            except (ValueError, IndexError, AssertionError):
                if in_valid_rows:
                    raise

    topics = pd.DataFrame(topics).set_index('hash_val')
    if normalize:
        topics = topics.div(topics.sum(axis=1), axis=0)

    return topics
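
A hypothetical usage sketch, assuming the helper above is importable and that topics.dat is the --readable_model file written by a VW LDA run with 20 topics:

topics = parse_lda_topics('topics.dat', num_topics=20, normalize=True)
# One row per hash_val; with normalize=True each row sums to 1.
print(topics.head())
print(topics.sum(axis=1).head())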
Example #9
def convert_to_json(in_fname, out_fname, enclose_in_packages, trans):
    with common.smart_open(out_fname, 'wt') as out:
        if enclose_in_packages:
            out.write('{"packages":[')
        out.write('{"data":[')
        first = True
        for seq in translate_file(in_fname, trans):
            if first:
                first = False
            else:
                out.write(',')
            json.dump(seq, out)
        out.write('],"name":')
        json.dump(in_fname, out)
        out.write("}")
        if enclose_in_packages:
            out.write(']}')
Example #10
    def sfile_to_token_iter(self, filepath_or_buffer, limit=None):
        """
        Return an iterator over filepath_or_buffer that returns, line-by-line,
        a token_list.

        Parameters
        ----------
        filepath_or_buffer : string or file handle / StringIO.
            File should be formatted according to self.format.

        Returns
        -------
        token_iter : Iterator
            E.g. token_iter.next() gets the next line as a list of tokens.
        """
        with smart_open(filepath_or_buffer) as open_file:
            for index, line in enumerate(open_file):
                if index == limit:
                    raise StopIteration
                yield self.sstr_to_token_list(line)
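
A hypothetical usage sketch, assuming formatter is an instance of a class that defines this method (for instance the VWFormatter seen in a later example, if it inherits it) and that docs.vw is a file in the expected sparse format:

for token_list in formatter.sfile_to_token_iter('docs.vw', limit=10):
    print(token_list)   # at most 10 lines are yielded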
Example #11
def parse_lda_predictions(
    predictions_file, num_topics, start_line, normalize=True):
    """
    Return a DataFrame representation of a VW prediction file.

    Parameters
    ----------
    predictions_file : filepath or buffer
        The -p output of a VW lda run
    num_topics : Integer
        The number of topics you should see
    start_line : Integer
        Start reading the predictions file here.
        The predictions file contains repeated predictions, one for every pass.
        You generally do not want every prediction.
    normalize : Boolean
        Normalize the rows so that they represent probabilities of topic
        given doc_id.
    """
    predictions = {'topic_%d' % i: [] for i in range(num_topics)}
    predictions['doc_id'] = []

    with smart_open(predictions_file) as open_file:
        for line_num, line in enumerate(open_file):
            if line_num < start_line:
                continue
            split_line = line.split()
            for item_num, item in enumerate(split_line):
                if item_num < num_topics:
                    predictions['topic_%d' % item_num].append(float(item))
                else:
                    predictions['doc_id'].append(item)

    predictions = pd.DataFrame(predictions).set_index('doc_id')
    if normalize:
        predictions = predictions.div(predictions.sum(axis=1), axis=0)

    return predictions
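
The start_line argument is produced by find_start_line_lda_predictions (Example #1). A hypothetical end-to-end sketch, assuming both helpers are importable and that predictions.dat is the -p output of a VW LDA run with 20 topics:

num_topics = 20
start = find_start_line_lda_predictions('predictions.dat', num_topics)
preds = parse_lda_predictions('predictions.dat', num_topics, start, normalize=True)
# One row per doc_id, restricted to the last pass; with normalize=True rows sum to 1.
print(preds.head())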
Example #12
def parse_varinfo(varinfo_file):
    """
    Uses the output of the vw-varinfo utility to get a DataFrame with variable
    info.

    Parameters
    ----------
    varinfo_file : Path or buffer
        The output of vw-varinfo
    """
    with smart_open(varinfo_file) as open_file:
        # For some reason, pandas is confused...so just split the lines
        # Create a dict {item1: [...], item2: [...],...} for each item in the
        # header
        header = open_file.next().split()
        rows = {col_name: [] for col_name in header}
        for line in open_file:
            for i, item in enumerate(line.split()):
                rows[header[i]].append(item)

    # Create a data frame
    varinfo = pd.DataFrame(rows)
    # Format columns correctly
    varinfo.FeatureName = varinfo.FeatureName.str.replace('^', '')
    varinfo.HashVal = varinfo.HashVal.astype(int)
    varinfo.MaxVal = varinfo.MaxVal.astype(float)
    varinfo.MinVal = varinfo.MinVal.astype(float)
    varinfo.RelScore = (
        varinfo.RelScore.str.replace('%', '').astype(float) / 100)
    varinfo.Weight = varinfo.Weight.astype(float)

    # Rename columns to decent Python names
    varinfo = varinfo.rename(
        columns={'FeatureName': 'feature_name', 'HashVal': 'hash_val',
            'MaxVal': 'max_val', 'MinVal': 'min_val', 'RelScore': 'rel_score',
            'Weight': 'weight'}).set_index('hash_val')

    return varinfo
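
A hypothetical usage sketch, assuming varinfo.txt holds the captured output of vw-varinfo and a pandas version that provides sort_values:

varinfo = parse_varinfo('varinfo.txt')
# The most influential features by relative score.
print(varinfo.sort_values('rel_score', ascending=False).head(10))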
Example #13
    def _load_sfile_fwd(self, sfile):
        """
        Builds the "forward" objects involved in loading an sfile.
        """
        token2id = {}
        token_score = defaultdict(float)
        doc_freq = defaultdict(int)
        num_docs = 0

        hash_fun = self._get_hash_fun()

        with smart_open(sfile) as open_file:
            # Each line represents one document
            for line in open_file:
                num_docs += 1
                record_dict = self.formatter.sstr_to_dict(line)
                for token, value in record_dict['feature_values'].iteritems():
                    hash_value = hash_fun(token)
                    token2id[token] = hash_value
                    token_score[token] += value
                    doc_freq[token] += 1

        return token2id, token_score, doc_freq, num_docs
Example #14
    def to_vw(self, outfile, n_jobs=1, chunksize=1000):
        """
        Write our filestream to a VW (Vowpal Wabbit) formatted file.

        Parameters
        ----------
        outfile : filepath or buffer
        n_jobs : Integer
            Use n_jobs different jobs to do the processing.  Set = 4 for 4 
            jobs.  Set = -1 to use all available, -2 for all except 1,...
        chunksize : Integer
            Workers process this many jobs at once before pickling and sending
            results to master.  If this is too low, communication overhead
            will dominate.  If this is too high, jobs will not be distributed
            evenly.
        """
        # Note:  This is similar to declass/cmd/files_to_vw.py
        # This implementation is more complicated, due to the fact that a
        # streamer specifies the method to extract doc_id from a stream.
        # To be faithful to the streamer, we must therefore use the streamer
        # to stream the files.  This requires a combination of imap_easy and
        # a chunker.
        #
        # Create an iterator over chunks of paths
        path_group_iter = common.grouper(self.paths, chunksize)

        formatter = text_processors.VWFormatter()

        func = partial(_group_to_sstr, self, formatter)
        # Process one group at a time...set imap_easy chunksize arg to 1
        # since each group contains many paths.
        results_iterator = imap_easy(func, path_group_iter, n_jobs, 1)

        with smart_open(outfile, 'w') as open_outfile:
            for group_results in results_iterator:
                for sstr in group_results:
                    open_outfile.write(sstr + '\n')
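
A hypothetical usage sketch; streamer stands for an instance of the (unshown) streaming class this method belongs to:

streamer.to_vw('corpus.vw', n_jobs=-1, chunksize=1000)   # all available cores, 1000 paths per chunk

As the docstring notes, chunksize trades communication overhead against how evenly work is spread across workers.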
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='The input JSON file.')
    parser.add_argument('outfile',
                        nargs='?',
                        default=None,
                        help='The output file. Default: standard-output')
    parser.add_argument(
        '--min-len',
        default=3,
        type=int,
        help='The minimum call-sequence length accepted. Default: %(default)r')
    parser.add_argument(
        '--idf-treshold',
        default=.25,
        type=float,
        help=
        'A percentage. Any call whose IDF is below this value will be ignored. Default: %(default).2f%%'
    )
    parser.add_argument(
        '--stop-words-file',
        help=
        'Provide a file (one term per line) with terms that must be removed from any sequence. Practically, this step removes terms from the vocabulary.'
    )
    parser.add_argument(
        '--alias-file',
        help=
        'Provide a YAML file of aliases; each term matching a key is replaced by the corresponding value.'
    )
    parser.add_argument('--skip-filter-low',
                        dest="run_tf",
                        action="store_false",
                        help='Disables the low-frequency filter.')
    parser.add_argument(
        '--vocabs-file',
        help=
        'Disables the low-frequency filter. Uses the supplied vocabulary file, filtering out any term that is not in the vocabulary.'
    )
    get_nprocs = common.parser_add_parallelism(parser)

    args = parser.parse_args()

    try:
        if args.vocabs_file is not None:
            vocabs = set(parse_word_list(args.vocabs_file))
        else:
            vocabs = None

        if args.alias_file is not None:
            import yaml
            alias = yaml.load(open(args.alias_file))
        else:
            alias = None

        if args.stop_words_file is not None:
            stopwords = set(parse_word_list(args.stop_words_file))
        else:
            stopwords = None

        with common.smart_open(args.infile, 'rt') as f:
            data = json.load(f)

        ds = sal.Dataset(js=data)
        if alias is not None and len(alias) > 0:
            ds.translate_calls(alias)
        if vocabs is not None and len(vocabs) > 0:
            ds.filter_vocabs(vocabs)
        if stopwords is not None and len(stopwords) > 0:
            ds.filter_stopwords(stopwords)

        ds.filter_sequences(min_length=args.min_len)

        if args.run_tf:
            # Additionally run the TF/IDF filter
            tf = get_term_frequency(data,
                                    nprocs=get_nprocs(args),
                                    min_seq_len=args.min_len)
            vocabs = get_common_vocabs(tf,
                                       idf_treshold=(args.idf_treshold / 100))
            ds.filter_vocabs(vocabs)

        if args.outfile is None:
            json.dump(data, sys.stdout)
        else:
            with common.smart_open(args.outfile, 'wt') as f:
                json.dump(data, f)
    except KeyboardInterrupt:
        sys.exit(1)
Example #16
def main():
  args = get_args()

  # File containing speaker labels per segment
  seg2label = {}
  with common_lib.smart_open(args.labels) as labels_file:
    for line in labels_file:
      seg, label = line.strip().split()
      seg2label[seg] = label

  # Segments file
  reco2segs = {}
  with common_lib.smart_open(args.segments) as segments_file:
    for line in segments_file:
      seg, reco, start, end = line.strip().split()
      try:
        if reco in reco2segs:
          reco2segs[reco] = reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg]
        else:
          reco2segs[reco] = reco + " " + start + "," + end + "," + seg2label[seg]
      except KeyError:
        raise RuntimeError("Missing label for segment {0}".format(seg))

  # Cut up overlapping segments so they are contiguous
  contiguous_segs = []
  for reco in reco2segs:
    segs = reco2segs[reco].strip().split()
    new_segs = ""
    for i in range(1, len(segs)-1):
      start, end, label = segs[i].split(',')
      next_start, next_end, next_label = segs[i+1].split(',')
      if float(end) > float(next_start):
        done = False
        avg = str((float(next_start) + float(end)) / 2.0)
        segs[i+1] = ','.join([avg, next_end, next_label])
        new_segs += " " + start + "," + avg + "," + label
      else:
        new_segs += " " + start + "," + end + "," + label
    start, end, label = segs[-1].split(',')
    new_segs += " " + start + "," + end + "," + label
    contiguous_segs.append(reco + new_segs)

  # Merge contiguous segments of the same label
  merged_segs = []
  for reco_line in contiguous_segs:
    segs = reco_line.strip().split()
    reco = segs[0]
    new_segs = ""
    for i in range(1, len(segs)-1):
      start, end, label = segs[i].split(',')
      next_start, next_end, next_label = segs[i+1].split(',')
      if float(end) == float(next_start) and label == next_label:
        segs[i+1] = ','.join([start, next_end, next_label])
      else:
        new_segs += " " + start + "," + end + "," + label
    start, end, label = segs[-1].split(',')
    new_segs += " " + start + "," + end + "," + label
    merged_segs.append(reco + new_segs)

  with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
    for reco_line in merged_segs:
      segs = reco_line.strip().split()
      reco = segs[0]
      for i in range(1, len(segs)):
        start, end, label = segs[i].strip().split(',')
        print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>".format(
          reco, float(start), float(end)-float(start), label), file=rttm_writer)
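
The overlap handling above cuts two overlapping segments at the midpoint of the overlap so that they become contiguous. A standalone toy illustration of that step (not Kaldi code):

# (label, start, end); the first segment overlaps the second
segs = [('spk1', 0.0, 5.0), ('spk2', 4.0, 9.0)]
(label1, start1, end1), (label2, start2, end2) = segs
if end1 > start2:
    mid = (start2 + end1) / 2.0   # midpoint of the overlap: 4.5
    segs = [(label1, start1, mid), (label2, mid, end2)]
print(segs)   # [('spk1', 0.0, 4.5), ('spk2', 4.5, 9.0)]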
Example #17
File: make_rttm.py  Project: mmz211/kaldi
def main():
  args = get_args()

  # File containing speaker labels per segment
  seg2label = {}
  with common_lib.smart_open(args.labels) as labels_file:
    for line in labels_file:
      seg, label = line.strip().split()
      seg2label[seg] = label

  # Segments file
  reco2segs = {}
  with common_lib.smart_open(args.segments) as segments_file:
    for line in segments_file:
      seg, reco, start, end = line.strip().split()
      try:
        if reco in reco2segs:
          reco2segs[reco] = reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg]
        else:
          reco2segs[reco] = reco + " " + start + "," + end + "," + seg2label[seg]
      except KeyError:
        raise RuntimeError("Missing label for segment {0}".format(seg))

  # Cut up overlapping segments so they are contiguous
  contiguous_segs = []
  for reco in sorted(reco2segs):
    segs = reco2segs[reco].strip().split()
    new_segs = ""
    for i in range(1, len(segs)-1):
      start, end, label = segs[i].split(',')
      next_start, next_end, next_label = segs[i+1].split(',')
      if float(end) > float(next_start):
        done = False
        avg = str((float(next_start) + float(end)) / 2.0)
        segs[i+1] = ','.join([avg, next_end, next_label])
        new_segs += " " + start + "," + avg + "," + label
      else:
        new_segs += " " + start + "," + end + "," + label
    start, end, label = segs[-1].split(',')
    new_segs += " " + start + "," + end + "," + label
    contiguous_segs.append(reco + new_segs)

  # Merge contiguous segments of the same label
  merged_segs = []
  for reco_line in contiguous_segs:
    segs = reco_line.strip().split()
    reco = segs[0]
    new_segs = ""
    for i in range(1, len(segs)-1):
      start, end, label = segs[i].split(',')
      next_start, next_end, next_label = segs[i+1].split(',')
      if float(end) == float(next_start) and label == next_label:
        segs[i+1] = ','.join([start, next_end, next_label])
      else:
        new_segs += " " + start + "," + end + "," + label
    start, end, label = segs[-1].split(',')
    new_segs += " " + start + "," + end + "," + label
    merged_segs.append(reco + new_segs)

  with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
    for reco_line in merged_segs:
      segs = reco_line.strip().split()
      reco = segs[0]
      for i in range(1, len(segs)):
        start, end, label = segs[i].strip().split(',')
        print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
          reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer)
Example #18
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description="Partition a Salento input file.")
    parser.add_argument("filename",
                        help="The JSON filename we are processing.")
    parser.add_argument(
        "outfiles",
        default=None,
        nargs="*",
        help=
        "The output filenames. When provided these will be used rather than an format string."
    )
    parser.add_argument("--format",
                        default="{basename}-{idx}.json{compress}",
                        help="Output filename template. Default: %(default)s")
    parser.add_argument("-j", action="store_true", help="Compress data.")
    parser.add_argument("-v", action="store_true", help="Print filename.")
    parser.add_argument(
        "--skip-shuffle",
        dest="shuffle",
        action="store_false",
        help=
        "Except when partitioning by package name, we shuffle which sequences appear in each partition; with this option the sequence order is preserved."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--n-ways',
        type=int,
        help='Partition the dataset into a given number of files')
    group.add_argument('--per-package',
                       action='store_true',
                       help='Partition each package into a given file.')
    group.add_argument(
        '--ratio',
        type=parse_ratio,
        help=
        'Partition the dataset into 2 parts, according to the ratio given by this argument.'
    )
    args = parser.parse_args()

    if args.n_ways is not None:
        count = args.n_ways
    elif args.ratio is not None:
        count = 2
    else:
        count = None

    if count is not None:
        filenames = get_out_files(args, count)

    with common.smart_open(args.filename, 'rt') as fp:
        js = json.load(fp)
        if count is None:
            assert args.per_package
            filenames = get_out_files(args, len(js['packages']))

        if args.n_ways is not None:
            part_algo = partition_by_count
        elif args.ratio is not None:
            part_algo = partition_by_ratio
        else:
            part_algo = partition_by_package

        for fname in part_algo(js, filenames, args):
            if args.v:
                print(fname)