Exemplo n.º 1
0
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Reads every MegaM file named on the command line, joins the non-zero
    features of instances that share the same filename/ID, and prints the
    combined MegaM data to standard output.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str

    :raises ValueError: If an instance is seen with conflicting class
                        labels, or if a feature value is neither ``'N/A'``
                        nor convertible to a float.
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Combine MegaM files that \
                                                  contain features for the same\
                                                  files.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('megam_file',
                        help='MegaM input file(s). Each feature line must be \
                              preceded by a comment with the filename/ID that \
                              the features should be joined on.',
                        nargs='+')
    parser.add_argument('-b', '--binary',
                        help='Converts all of the features in the specified \
                              range of files to presence/absence binary \
                              features. Files are numbered starting from 1, and\
                              if 0 is specified with this flag, all files are\
                              converted.',
                        type=parse_num_list)
    parser.add_argument('--doubleup',
                        help='Keep both the binary and numeric versions of any\
                              feature you convert to binary.',
                        action='store_true')
    parser.add_argument('-c', '--common',
                        help='Only output features for filenames that are \
                              common to all MegaM files.',
                        action='store_true')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    def _native(text):
        # Python 2 (where ``str is bytes``) wants UTF-8 byte strings for
        # printing and messages; on Python 3 encoding would make the text
        # show up as b'...'.
        return text.encode('utf-8') if str is bytes else text

    # Map from filenames to joined feature strings. This must not share a
    # name with the per-instance feature mapping yielded by _MegaMDictIter.
    joined_features = OrderedDict()
    class_dict = {}
    filename_set = set()

    # Set that will contain all of the features seen in previous files
    # (for duplicate detection)
    prev_feature_set = set()

    def _append_pair(filename, name, value):
        # Add one "name value " pair to the feature string of ``filename``
        joined_features[filename] = (joined_features.get(filename, '') +
                                     '{} {} '.format(name, value))

    # Iterate through MegaM files
    for file_num, infile in enumerate(args.megam_file, start=1):
        # Inputs arrive from argparse as plain path strings; still accept
        # file-like objects that carry a ``name`` attribute.
        infile_name = getattr(infile, 'name', infile)

        # Whether this file's features get converted to presence/absence
        make_binary = bool(args.binary) and (args.binary == [0] or
                                             file_num in args.binary)

        # Features and filenames seen in the current MegaM file
        curr_feature_set = set()
        curr_filename_set = set()

        # Handle current MegaM file
        for curr_filename, class_name, inst_features in _MegaMDictIter(infile):
            curr_filename_set.add(curr_filename)

            if class_dict.setdefault(curr_filename, class_name) != class_name:
                raise ValueError(("Inconsistent class label for instance " +
                                  "{} in {}.").format(curr_filename,
                                                      infile_name))

            # Warn about lack of features (although that really just means
            # all of them have zero values)
            if not inst_features:
                joined_features.setdefault(curr_filename, "")
                logger.warning(_native(("No features found for {} in {}. All are " +
                                        "assumed to be zero.").format(curr_filename,
                                                                      infile_name)))
                continue

            for feat_name, feat_val in iteritems(inst_features):
                # Handle duplicate features
                feat_name = get_unique_name(feat_name, prev_feature_set,
                                            infile_name)

                # Ignore zero-valued (and 'N/A') features
                try:
                    is_nonzero = feat_val != 'N/A' and float(feat_val) != 0
                except ValueError:
                    raise ValueError(_native(("Invalid feature value in feature " +
                                              "pair '{} {}' for file {}").format(feat_name,
                                                                                 feat_val,
                                                                                 curr_filename)))
                if not is_nonzero:
                    continue

                # Convert feature to binary if necessary
                if make_binary:
                    if args.doubleup:
                        # Keep the numeric version next to the binary one
                        _append_pair(curr_filename, feat_name, feat_val)
                        curr_feature_set.add(feat_name)
                        feat_name = get_unique_name(feat_name + "_binary",
                                                    prev_feature_set,
                                                    infile_name)
                    feat_val = 1

                # Add feature pair to current string of features
                _append_pair(curr_filename, feat_name, feat_val)
                curr_feature_set.add(feat_name)

        # Add current file's features to set of seen features
        prev_feature_set.update(curr_feature_set)

        # Either intersect or union current file's filenames with existing
        # ones. The first file always seeds the set; keying on file_num
        # (rather than on the set being non-empty) keeps an empty
        # intersection from being refilled by a later file.
        if args.common and file_num > 1:
            filename_set.intersection_update(curr_filename_set)
        else:
            filename_set.update(curr_filename_set)

    # Print new MegaM file
    for curr_filename in joined_features:
        # Skip files that aren't common when args.common is true
        if curr_filename not in filename_set:
            continue
        print(_native("# {}".format(curr_filename)))
        print(_native("{}\t{}".format(class_dict[curr_filename],
                                      joined_features[curr_filename].strip())))
Exemplo n.º 2
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Reads the stop word list, then prints every instance of the MegaM input
    to standard output with its features filtered: a feature is printed only
    if its name consists solely of word characters and hyphens and it passes
    the stop word test (not in the list by default, in the list with
    ``--keep``).

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Filter MegaM file to remove\
                                                  features with names in stop\
                                                  word list (or non alphabetic\
                                                  characters). Also has \
                                                  side-effect of removing TEST,\
                                                  TRAIN, and DEV lines if they\
                                                  are present.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("infile", help="MegaM input file", default="-", nargs="?")
    parser.add_argument("stopwordlist", help="Stop word file", type=argparse.FileType("r"))
    parser.add_argument("-i", "--ignorecase", help="Do case insensitive feature name matching.", action="store_true")
    parser.add_argument(
        "-k",
        "--keep",
        help="Instead of removing features with names in the\
                              list, keep only those.",
        action="store_true",
    )
    parser.add_argument("--version", action="version", version="%(prog)s {0}".format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " + "%(message)s"))

    # ``infile`` is parsed without a ``type``, so it is a plain string and has
    # no ``isatty`` method. File-like objects are still honoured if present.
    # NOTE(review): "-" is presumed to mean standard input (it is the
    # argument's default) -- confirm against _MegaMDictIter.
    if hasattr(args.infile, "isatty"):
        interactive = args.infile.isatty()
    else:
        interactive = args.infile == "-" and sys.stdin.isatty()

    if interactive:
        print(
            "You are running this script interactively. Press CTRL-D at "
            + "the start of a blank line to signal the end of your input. "
            + "For help, run it with --help\n",
            file=sys.stderr,
        )

    # Read stop word list (lower-cased when matching case-insensitively)
    if args.ignorecase:
        stopwords = {w.strip().lower() for w in args.stopwordlist}
    else:
        stopwords = {w.strip() for w in args.stopwordlist}

    # Feature names made up of anything but word characters and hyphens are
    # always dropped; compile the pattern once for the whole run.
    valid_name = re.compile(r"[\w-]*$")

    # Iterate through MegaM file
    for example_id, class_name, feature_dict in _MegaMDictIter(args.infile):
        if example_id is not None:
            print("# {}".format(example_id))
        print(class_name, end="\t")

        kept_pairs = []
        for feature, value in iteritems(feature_dict):
            feature = feature.strip()
            if not valid_name.match(feature):
                continue
            # With --ignorecase the stop words are lower-cased, so the
            # feature name must be lower-cased for BOTH removal and keeping;
            # comparing the raw name would let "The" slip past "the".
            lookup_name = feature.lower() if args.ignorecase else feature
            if (lookup_name in stopwords) == args.keep:
                kept_pairs.append("{} {}".format(feature, value))
        print(" ".join(kept_pairs))
Exemplo n.º 3
0
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Loads the stop word file and writes the MegaM input back to standard
    output, one instance at a time, keeping only features whose names are
    made up of word characters and hyphens and that pass the stop word
    check (absent from the list by default, present in it with ``--keep``).

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Filter MegaM file to remove\
                                                  features with names in stop\
                                                  word list (or non alphabetic\
                                                  characters). Also has \
                                                  side-effect of removing TEST,\
                                                  TRAIN, and DEV lines if they\
                                                  are present.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='MegaM input file',
                        default='-',
                        nargs='?')
    parser.add_argument('stopwordlist',
                        help='Stop word file',
                        type=argparse.FileType('r'))
    parser.add_argument('-i',
                        '--ignorecase',
                        help='Do case insensitive feature name matching.',
                        action='store_true')
    parser.add_argument('-k',
                        '--keep',
                        help='Instead of removing features with names in the\
                              list, keep only those.',
                        action='store_true')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    # The positional ``infile`` has no ``type``, so argparse hands back a
    # string, and strings have no ``isatty``. Use the object's own method
    # only when it exists.
    # NOTE(review): '-' (the default) is presumed to stand for standard
    # input -- confirm against _MegaMDictIter.
    infile_isatty = getattr(args.infile, 'isatty', None)
    if infile_isatty is not None:
        on_terminal = infile_isatty()
    else:
        on_terminal = args.infile == '-' and sys.stdin.isatty()

    if on_terminal:
        print("You are running this script interactively. Press CTRL-D at " +
              "the start of a blank line to signal the end of your input. " +
              "For help, run it with --help\n",
              file=sys.stderr)

    # Read stop word list (lower-cased for case-insensitive matching)
    if args.ignorecase:
        stopwords = {w.strip().lower() for w in args.stopwordlist}
    else:
        stopwords = {w.strip() for w in args.stopwordlist}

    # Only names consisting of word characters and hyphens survive; the
    # pattern is loop-invariant, so compile it once.
    name_pattern = re.compile(r'[\w-]*$')

    # Iterate through MegaM file
    for example_id, class_name, feature_dict in _MegaMDictIter(args.infile):
        if example_id is not None:
            print("# {}".format(example_id))
        print(class_name, end="\t")

        surviving = []
        for feature, value in iteritems(feature_dict):
            feature = feature.strip()
            if name_pattern.match(feature) is None:
                continue
            # The stop words were lower-cased under --ignorecase, so the
            # name must be normalised the same way in removal mode too;
            # testing the raw name first let capitalised stop words through.
            key = feature.lower() if args.ignorecase else feature
            is_stopword = key in stopwords
            if is_stopword == args.keep:
                surviving.append('{} {}'.format(feature, value))
        print(" ".join(surviving))