Пример #1
0
def main(args):
    """Print the LDA topic distribution for the first songs of a lyrics source.

    Loads a dictionary and an LdaModel from the paths given on the command
    line, obtains lyrics through getLyricsFromElasticSearch() or
    getLyricsFromJson(), and prints the model's topic distribution followed
    by the lyrics for every song that passes the optional English filter.
    The loop stops once the song at index 31 has been handled, so at most
    32 songs are read.

    Args:
        args: list of command-line arguments (without the program name),
            passed straight to argparse.

    Returns:
        0 on completion.

    Raises:
        SystemExit: raised by argparse for usage errors, including the case
            where neither a lyrics file nor --elastic-search is supplied.
    """

    from argparse import ArgumentParser

    parser = ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--verbose", dest="verbosity", default=0, action="count",
                      help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-d", "--dictionary", dest="dictionary", required=True,
                      help="Dictionary file")
    parser.add_argument("-m", "--model", dest="model", required=True,
                      help="Model file")
    # NOTE(review): --show-examples is parsed but never read in this function.
    parser.add_argument("-s", "--show-examples", dest="showExamples", type=int, default=None,
                      help="Show n examples that rank high in each topic")
    parser.add_argument("-e", "--english-only", dest="filterEnglishOnly", default=False, action="store_true",
                      help="Filter corpus and only let through english lyrics")
    parser.add_argument("--elastic-search", dest="elasticSearch", default=False, action="store_true",
                      help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?",
                      help="File to load lyrics from")

    parameters = parser.parse_args(args)

    # "lyrics" is optional on the command line (nargs="?"), yet it is the only
    # data source unless --elastic-search is given.  Report that as a usage
    # error instead of crashing later in os.path.expanduser(None).
    if not parameters.elasticSearch and parameters.lyrics is None:
        parser.error("a lyrics file is required unless --elastic-search is given")

    # Imported only once the arguments are known to be usable, so usage
    # errors are reported without involving the logging helper.
    from simplelogsetter import SimpleLogSetter

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    logging.info("Loading dictionary from %s", dictionaryPath)
    dictionary = corpora.Dictionary.load(dictionaryPath)

    modelPath = os.path.expanduser(parameters.model)
    logging.info("Loading model from %s", modelPath)
    model = models.ldamodel.LdaModel.load(modelPath)

    if parameters.elasticSearch:
        lyricsGenerator = getLyricsFromElasticSearch()
    else:
        lyricsGenerator = getLyricsFromJson(os.path.expanduser(parameters.lyrics))

    for index, songLyrics in enumerate(lyricsGenerator):
        # allow_update=False: the loaded dictionary must not be modified here.
        frequencied = dictionary.doc2bow(cleanLyrics(songLyrics), allow_update=False)
        if not parameters.filterEnglishOnly or isItEnglish(frequencied, dictionary):
            distribution = model[frequencied]
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(distribution)
            print(songLyrics)

        # Songs rejected by the filter still count: stop after index 31.
        if index > 30:
            break

    return 0
Пример #2
0
def main(args):
    """Print the LDA topic distribution for the first songs of a lyrics source.

    Loads a dictionary and an LdaModel from the paths given on the command
    line, obtains lyrics through getLyricsFromElasticSearch() or
    getLyricsFromJson(), and prints the model's topic distribution followed
    by the lyrics for every song that passes the optional English filter.
    The loop stops once the song at index 31 has been handled, so at most
    32 songs are read.

    Args:
        args: list of command-line arguments (without the program name),
            passed straight to argparse.

    Returns:
        0 on completion.

    Raises:
        SystemExit: raised by argparse for usage errors, including the case
            where neither a lyrics file nor --elastic-search is supplied.
    """

    from argparse import ArgumentParser

    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        default=0,
        action="count",
        help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-d",
                        "--dictionary",
                        dest="dictionary",
                        required=True,
                        help="Dictionary file")
    parser.add_argument("-m",
                        "--model",
                        dest="model",
                        required=True,
                        help="Model file")
    # NOTE(review): --show-examples is parsed but never read in this function.
    parser.add_argument("-s",
                        "--show-examples",
                        dest="showExamples",
                        type=int,
                        default=None,
                        help="Show n examples that rank high in each topic")
    parser.add_argument(
        "-e",
        "--english-only",
        dest="filterEnglishOnly",
        default=False,
        action="store_true",
        help="Filter corpus and only let through english lyrics")
    parser.add_argument("--elastic-search",
                        dest="elasticSearch",
                        default=False,
                        action="store_true",
                        help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?", help="File to load lyrics from")

    parameters = parser.parse_args(args)

    # The positional "lyrics" may be omitted (nargs="?"), but without
    # --elastic-search there is then nothing to read.  Fail with a usage
    # message rather than a traceback from os.path.expanduser(None).
    if parameters.lyrics is None and not parameters.elasticSearch:
        parser.error(
            "a lyrics file is required unless --elastic-search is given")

    # Deferred until the arguments have been validated, so usage errors do
    # not depend on the logging helper being importable.
    from simplelogsetter import SimpleLogSetter

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    logging.info("Loading dictionary from %s", dictionaryPath)
    dictionary = corpora.Dictionary.load(dictionaryPath)

    modelPath = os.path.expanduser(parameters.model)
    logging.info("Loading model from %s", modelPath)
    model = models.ldamodel.LdaModel.load(modelPath)

    if parameters.elasticSearch:
        lyricsGenerator = getLyricsFromElasticSearch()
    else:
        lyricsGenerator = getLyricsFromJson(
            os.path.expanduser(parameters.lyrics))

    for index, songLyrics in enumerate(lyricsGenerator):
        # allow_update=False: the loaded dictionary must not be modified here.
        frequencied = dictionary.doc2bow(cleanLyrics(songLyrics),
                                         allow_update=False)
        if not parameters.filterEnglishOnly or isItEnglish(
                frequencied, dictionary):
            distribution = model[frequencied]
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(distribution)
            print(songLyrics)

        # Songs rejected by the filter still count: stop after index 31.
        if index > 30:
            break

    return 0
Пример #3
0
def main(args):
    """Train an LDA model on song lyrics and save it.

    Steps, in order:
      1. Load the dictionary from --dictionary if that file exists; otherwise
         start an empty one that is grown while the corpus is built.
      2. Load the bag-of-words corpus from --corpus if the dictionary is
         non-empty and the corpus file exists; otherwise build it from the
         lyrics source (getLyricsFromElasticSearch() or getLyricsFromJson()),
         serialize it to --corpus, and save the dictionary if it was new.
      3. Optionally filter the corpus (English only, stop words).
      4. Train models.ldamodel.LdaModel and save it to --model.

    Args:
        args: list of command-line arguments (without the program name),
            passed straight to argparse.

    Returns:
        0 on completion.

    Raises:
        SystemExit: raised by argparse for usage errors, including the case
            where the corpus has to be built but neither a lyrics file nor
            --elastic-search was supplied.
    """

    from argparse import ArgumentParser
    from simplelogsetter import SimpleLogSetter

    parser = ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--verbose", dest="verbosity", default=0, action="count",
                      help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-c", "--corpus", dest="corpus", required=True,
                      help="Corpus file to save to/load from")
    parser.add_argument("-d", "--dictionary", dest="dictionary", required=True,
                      help="Dictionary file to save to/load from")
    parser.add_argument("-m", "--model", dest="model", required=True,
                      help="Model file to save to/load from")
    parser.add_argument("-t", "--topics", dest="topics", required=True, type=int,
                      help="Number of topics")
    parser.add_argument("-p", "--passes", dest="passes", type=int, default=1,
                      help="How many passes of the data (default: %(default)s)")
    parser.add_argument("-e", "--english-only", dest="filterEnglishOnly", default=False, action="store_true",
                      help="Filter corpus and only let through english lyrics")
    parser.add_argument("--save-filtered", dest="filteredCorpus", default=None,
                      help="Save the filtered corpus to this filename")
    parser.add_argument("-s", "--filter-stop", dest="filterStopWords", default=False, action="store_true",
                      help="Filter stop words before modelling (only implemented for english at the moment)")
    parser.add_argument("--elastic-search", dest="elasticSearch", default=False, action="store_true",
                      help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?",
                      help="File to load lyrics from")

    parameters = parser.parse_args(args)

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    if os.path.exists(dictionaryPath):
        logging.info("Loading dictionary from %s", dictionaryPath)
        dictionary = corpora.Dictionary.load(dictionaryPath)
        # An existing dictionary is used as-is and never written back.
        allowUpdate = False
    else:
        dictionary = corpora.Dictionary()
        allowUpdate = True

    corpusPath = os.path.expanduser(parameters.corpus)
    if len(dictionary) > 0 and os.path.exists(corpusPath):
        logging.info("Loading corpus from %s", corpusPath)
        allLyrics = corpora.MmCorpus(corpusPath)
    else:
        # The corpus must be built, so a lyrics source is mandatory from here
        # on.  "lyrics" is optional on the command line (nargs="?"); report a
        # usage error instead of crashing in os.path.expanduser(None).
        if parameters.elasticSearch:
            lyricsGenerator = getLyricsFromElasticSearch()
        elif parameters.lyrics is None:
            parser.error("a lyrics file is required to build the corpus "
                         "unless --elastic-search is given")
        else:
            lyricsGenerator = getLyricsFromJson(os.path.expanduser(parameters.lyrics))

        allLyrics = [dictionary.doc2bow(cleanLyrics(songLyrics), allow_update=allowUpdate)
                     for songLyrics in lyricsGenerator]

        corpora.MmCorpus.serialize(corpusPath, allLyrics)

        if allowUpdate:
            logging.info("Saving dictionary to %s", dictionaryPath)
            dictionary.save(dictionaryPath)

    if parameters.filterEnglishOnly:
        logging.info("Filtering for english only")
        allLyrics = englishOnly(allLyrics, dictionary)

        # NOTE(review): if englishOnly returns a one-shot iterator, serializing
        # it here would exhaust it before training -- confirm it is re-iterable.
        if parameters.filteredCorpus:
            corpora.MmCorpus.serialize(os.path.expanduser(parameters.filteredCorpus), allLyrics)

    if parameters.filterStopWords:
        logging.info("Filtering out stop words")
        allLyrics = filterFrequentWords(allLyrics, dictionary)

    model = models.ldamodel.LdaModel(corpus=allLyrics, id2word=dictionary, num_topics=parameters.topics,
                                     passes=parameters.passes)
    model.save(os.path.expanduser(parameters.model))

    return 0
Пример #4
0
def main(args):
    """Train an LDA model on song lyrics and save it.

    Steps, in order:
      1. Load the dictionary from --dictionary if that file exists; otherwise
         start an empty one that is grown while the corpus is built.
      2. Load the bag-of-words corpus from --corpus if the dictionary is
         non-empty and the corpus file exists; otherwise build it from the
         lyrics source (getLyricsFromElasticSearch() or getLyricsFromJson()),
         serialize it to --corpus, and save the dictionary if it was new.
      3. Optionally filter the corpus (English only, stop words).
      4. Train models.ldamodel.LdaModel and save it to --model.

    Args:
        args: list of command-line arguments (without the program name),
            passed straight to argparse.

    Returns:
        0 on completion.

    Raises:
        SystemExit: raised by argparse for usage errors, including the case
            where the corpus has to be built but neither a lyrics file nor
            --elastic-search was supplied.
    """

    from argparse import ArgumentParser
    from simplelogsetter import SimpleLogSetter

    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        default=0,
        action="count",
        help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-c",
                        "--corpus",
                        dest="corpus",
                        required=True,
                        help="Corpus file to save to/load from")
    parser.add_argument("-d",
                        "--dictionary",
                        dest="dictionary",
                        required=True,
                        help="Dictionary file to save to/load from")
    parser.add_argument("-m",
                        "--model",
                        dest="model",
                        required=True,
                        help="Model file to save to/load from")
    parser.add_argument("-t",
                        "--topics",
                        dest="topics",
                        required=True,
                        type=int,
                        help="Number of topics")
    parser.add_argument(
        "-p",
        "--passes",
        dest="passes",
        type=int,
        default=1,
        help="How many passes of the data (default: %(default)s)")
    parser.add_argument(
        "-e",
        "--english-only",
        dest="filterEnglishOnly",
        default=False,
        action="store_true",
        help="Filter corpus and only let through english lyrics")
    parser.add_argument("--save-filtered",
                        dest="filteredCorpus",
                        default=None,
                        help="Save the filtered corpus to this filename")
    parser.add_argument(
        "-s",
        "--filter-stop",
        dest="filterStopWords",
        default=False,
        action="store_true",
        help=
        "Filter stop words before modelling (only implemented for english at the moment)"
    )
    parser.add_argument("--elastic-search",
                        dest="elasticSearch",
                        default=False,
                        action="store_true",
                        help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?", help="File to load lyrics from")

    parameters = parser.parse_args(args)

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    if os.path.exists(dictionaryPath):
        logging.info("Loading dictionary from %s", dictionaryPath)
        dictionary = corpora.Dictionary.load(dictionaryPath)
        # An existing dictionary is used as-is and never written back.
        allowUpdate = False
    else:
        dictionary = corpora.Dictionary()
        allowUpdate = True

    corpusPath = os.path.expanduser(parameters.corpus)
    if len(dictionary) > 0 and os.path.exists(corpusPath):
        logging.info("Loading corpus from %s", corpusPath)
        allLyrics = corpora.MmCorpus(corpusPath)
    else:
        # Building the corpus needs a lyrics source.  The positional "lyrics"
        # may be omitted (nargs="?"), so turn the missing-source case into a
        # usage error instead of a crash in os.path.expanduser(None).
        if parameters.elasticSearch:
            lyricsGenerator = getLyricsFromElasticSearch()
        elif parameters.lyrics is None:
            parser.error("a lyrics file is required to build the corpus "
                         "unless --elastic-search is given")
        else:
            lyricsGenerator = getLyricsFromJson(
                os.path.expanduser(parameters.lyrics))

        allLyrics = [
            dictionary.doc2bow(cleanLyrics(songLyrics),
                               allow_update=allowUpdate)
            for songLyrics in lyricsGenerator
        ]

        corpora.MmCorpus.serialize(corpusPath, allLyrics)

        if allowUpdate:
            logging.info("Saving dictionary to %s", dictionaryPath)
            dictionary.save(dictionaryPath)

    if parameters.filterEnglishOnly:
        logging.info("Filtering for english only")
        allLyrics = englishOnly(allLyrics, dictionary)

        # NOTE(review): if englishOnly returns a one-shot iterator, serializing
        # it here would exhaust it before training -- confirm it is re-iterable.
        if parameters.filteredCorpus:
            corpora.MmCorpus.serialize(
                os.path.expanduser(parameters.filteredCorpus), allLyrics)

    if parameters.filterStopWords:
        logging.info("Filtering out stop words")
        allLyrics = filterFrequentWords(allLyrics, dictionary)

    model = models.ldamodel.LdaModel(corpus=allLyrics,
                                     id2word=dictionary,
                                     num_topics=parameters.topics,
                                     passes=parameters.passes)
    model.save(os.path.expanduser(parameters.model))

    return 0