Пример #1
0
def get_results(pages=300):
    """Collect tutorial search results for every tag in ``TAGS``.

    For each tag, the trigrams that contain the tag in any of their three
    positions are visited in descending ``count`` order.  Each trigram is
    turned into the query ``"<tag1> <tag2> <tag3> tutorial"`` and handed to
    ``fetch_results``.  Every returned result is recorded through
    ``save_page`` together with its 1-based rank, and results whose link has
    not been seen yet for the current tag are accumulated.  The accumulated
    list is finally passed to ``write_results_file`` with the path
    ``search_results/<tag>-results.json``.

    Args:
        pages: Target number of unique results per tag.  The threshold is
            checked only after a query has been fully processed, so the
            final list may hold more than ``pages`` entries; it holds fewer
            when the trigrams run out first.
    """
    for ml in TAGS:
        ml_results = []
        ml_links = set()  # links already collected for this tag

        trigrams = (
            Trigram.select()
            .where((Trigram.tag1 == ml) | (Trigram.tag2 == ml) | (Trigram.tag3 == ml))
            .order_by(Trigram.count.desc())
        )

        for tg in trigrams:
            query = " ".join([tg.tag1, tg.tag2, tg.tag3]) + " tutorial"

            # Single pass over the results: saving and de-duplicating in the
            # same loop avoids iterating the response twice, which would
            # silently yield nothing the second time for a one-shot iterator.
            for rank, r in enumerate(fetch_results(query), 1):
                link = r["link"]
                save_page(ml, query, link, rank, r["title"])
                if link not in ml_links:
                    ml_links.add(link)
                    ml_results.append(r)

            # Stop issuing queries once enough unique results are collected.
            if len(ml_results) >= pages:
                break

        write_results_file(os.path.join("search_results", ml + "-results.json"), ml_results)
Пример #2
0
                    tg = Trigram.create(tag1=tags_ord[0], tag2=tags_ord[1], tag3=tags_ord[2])
                except peewee.IntegrityError:
                    tg = Trigram.get(
                        Trigram.tag1 == tags_ord[0],
                        Trigram.tag2 == tags_ord[1],
                        Trigram.tag3 == tags_ord[2],
                    )
                tg.count = i['count']
                tg.save()


if __name__ == '__main__':
    # Command-line entry point.
    #   --tri TAG : print the ten highest-count trigrams containing TAG and exit.
    #   --init    : create the database tables before downloading.
    # Without --tri, bigrams and then trigrams are fetched.
    parser = argparse.ArgumentParser(description="Download n-grams of StackOverflow tags")
    parser.add_argument('--init', action='store_true', help='create database tables')
    parser.add_argument('--tri', help='print top 3-grams for a tag')
    args = parser.parse_args()

    if args.tri:
        # Query mode: match the tag in any of the three trigram positions.
        tgs = Trigram.select().where(
            (Trigram.tag1 == args.tri) |
            (Trigram.tag2 == args.tri) |
            (Trigram.tag3 == args.tri)
        ).order_by(Trigram.count.desc()).limit(10)
        for tg in tgs:
            # Parenthesized form works on both Python 2 and Python 3; the bare
            # print statement is a SyntaxError on Python 3.
            print(' '.join([tg.tag1, tg.tag2, tg.tag3]))
    else:
        # Download mode: optionally create the tables first, then fetch.
        if args.init:
            create_tables()
        fetch_bigrams()
        fetch_trigrams()