Example #1
 args = parser.parse_args()
 models = args.models
 print("Models names passed in Arguments:", str(models))
 # models = 'BART,T5,PEGASUS-CNN,PEGASUS-MED'
 models = models.split(',')
 if 'BART' in models:
     print('Model files downloading for BART')
     bart_model = BartForConditionalGeneration.from_pretrained(
         'facebook/bart-large-cnn')
     bart_tokenizer = BartTokenizer.from_pretrained(
         'facebook/bart-large-cnn')
     bart_model.to(device)
     bart_model.eval()
 if 'T5' in models:
     print('Model files downloading for T5')
     t5_model = T5ForConditionalGeneration.from_pretrained('t5-large')
     t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
     t5_model.to(device)
     t5_model.eval()
 if 'PEGASUS-CNN' in models:
     print('Model files downloading for PEGASUS-CNN')
     pegasus_cnn_model = PegasusForConditionalGeneration.from_pretrained(
         'google/pegasus-cnn_dailymail')
     pegasus_cnn_tokenizer = PegasusTokenizer.from_pretrained(
         'google/pegasus-cnn_dailymail')
     pegasus_cnn_model.to(device)
     pegasus_cnn_model.eval()
 if 'PEGASUS-MED' in models:
     print('Model files downloading for PEGASUS-MED')
     pegasus_med_model = PegasusForConditionalGeneration.from_pretrained(
         'google/pegasus-pubmed')
Example #2
import json

import torch
from flask import Flask
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from summarizer import Summarizer


BART_PATH = 'bart-large'
T5_PATH = 't5-base'
# BART_PATH = 'model/bart'
# T5_PATH = 'model/t5'

app = Flask(__name__)
bart_model = Summarizer()



t5_model = T5ForConditionalGeneration.from_pretrained(T5_PATH)
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def bart_summarize(input_text, num_beams=4, num_words=50):
    # Extractive summary via the bert-extractive-summarizer Summarizer instance;
    # note that num_beams and num_words are not used by this call.
    input_text = str(input_text)
    result = bart_model(input_text, min_length=50, max_length=100)
    output = ''.join(result)
    return output


def t5_summarize(input_text, num_beams=4, num_words=50):
    input_text = str(input_text).replace('\n', '')
Example #3
    def test_summarization(self):
        model = T5ForConditionalGeneration.from_pretrained("t5-base").to(
            torch_device)
        tok = T5Tokenizer.from_pretrained("t5-base")

        FRANCE_ARTICLE = 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. 
Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. 
CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.'  # @noqa
        EXPECTED_SUMMARY_FRANCE = 'french prosecutor says he is not aware of any video footage from on board the plane . prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a cell phone video of the final seconds of flight 9525 . all 150 on board were killed when the plane crashed into the french Alps .'

        SHORTER_ARTICLE = '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. 
The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'
        EXPECTED_SUMMARY_SHORTER = "the formal accession was marked with a ceremony at The Hague, in the Netherlands . the Palestinians signed the ICC's founding Rome Statute in January . they also accepted its jurisdiction over alleged crimes committed in occupied Palestinian territory . as members, Palestinians may be subject to counter-charges as well ."

        IRAN_ARTICLE = "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. 
This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions."
        EXPECTED_SUMMARY_IRAN = "the united states and its negotiating partners reached a very strong framework agreement with Iran . the agreement limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon . expect pushback anyway, if the recent past is any harbinger ."

        ARTICLE_SUBWAY = 'New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.  Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.'
        EXPECTED_SUMMARY_SUBWAY = "in total, barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002 . she is believed to still be married to four men, and at one time, she was married to eight men at once . prosecutors say the marriages were part of an immigration scam ."

        # Apply the checkpoint's summarization defaults (e.g. the "summarize: "
        # prefix, beam size and length limits) from task_specific_params.
        task_specific_config = getattr(model.config, "task_specific_params",
                                       {})
        summarization_config = task_specific_config.get("summarization", {})
        model.config.update(summarization_config)

        dct = tok(
            [
                model.config.prefix + x for x in [
                    FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE,
                    ARTICLE_SUBWAY
                ]
            ],
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        self.assertEqual(512, dct["input_ids"].shape[1])

        hypotheses_batch = model.generate(
            input_ids=dct["input_ids"].to(torch_device),
            attention_mask=dct["attention_mask"].to(torch_device),
            num_beams=4,
            length_penalty=2.0,
            max_length=142,
            min_length=56,
            no_repeat_ngram_size=3,
            do_sample=False,
            early_stopping=True,
        )

        decoded = [
            tok.decode(g,
                       skip_special_tokens=True,
                       clean_up_tokenization_spaces=False)
            for g in hypotheses_batch
        ]

        self.assertListEqual(
            [
                EXPECTED_SUMMARY_FRANCE, EXPECTED_SUMMARY_SHORTER,
                EXPECTED_SUMMARY_IRAN, EXPECTED_SUMMARY_SUBWAY
            ],
            decoded,
        )
Example #4
import os

import nlp
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm.auto import tqdm

# model = T5ForConditionalGeneration.from_pretrained('models/tpu').to('cpu') # because its loaded on xla by default
# tokenizer = T5Tokenizer.from_pretrained('models/tpu')

model_args, = parser.parse_json_file(json_file=os.path.abspath('args.json'))
print(model_args)
print(model_args.model_name_or_path)

# BEN
# model = T5ForConditionalGeneration.from_pretrained('models/tpu').to('cpu') # because its loaded on xla by default
# tokenizer = T5Tokenizer.from_pretrained('models/tpu')
model = T5ForConditionalGeneration.from_pretrained(
    model_args.model_name_or_path)
tokenizer = T5Tokenizer.from_pretrained(model_args.model_name_or_path)

valid_dataset = torch.load('valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=2)

answers = []
i = 0
for batch in tqdm(dataloader):
    i += 1
    outs = model.generate(input_ids=batch['input_ids'],
                          attention_mask=batch['attention_mask'],
                          max_length=16,
                          early_stopping=True)
    outs = [tokenizer.decode(ids) for ids in outs]
    answers.extend(outs)
Example #5
paragraphs = [
    "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.",
    "Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. Python is often described as a \"batteries included\" language due to its comprehensive standard library.",
    "Python was created in the late 1980s, and first released in 1991, by Guido van Rossum as a successor to the ABC programming language. Python 2.0, released in 2000, introduced new features, such as list comprehensions, and a garbage collection system with reference counting, and was discontinued with version 2.7 in 2020. Python 3.0, released in 2008, was a major revision of the language that is not completely backward-compatible and much Python 2 code does not run unmodified on Python 3. With Python 2's end-of-life (and pip having dropped support in 2021), only Python 3.6.x and later are supported, with older versions still supporting e.g. Windows 7 (and old installers not restricted to 64-bit Windows).",
    "Python interpreters are supported for mainstream operating systems and available for a few more (and in the past supported many more). A global community of programmers develops and maintains CPython, a free and open-source reference implementation. A non-profit organization, the Python Software Foundation, manages and directs resources for Python and CPython development.",
    "As of January 2021, Python ranks third in TIOBE’s index of most popular programming languages, behind C and Java, having previously gained second place and their award for the most popularity gain for 2020.",
    "Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let application developers write once, run anywhere (WORA), meaning that compiled Java code can run on all platforms that support Java without the need for recompilation. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019, Java was one of the most popular programming languages in use according to GitHub, particularly for client-server web applications, with a reported 9 million developers.",
    "Java was originally developed by James Gosling at Sun Microsystems (which has since been acquired by Oracle) and released in 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GNU General Public License. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open source software and used by most developers and is the default JVM for almost all Linux distributions.",
    "As of September 2020, the latest version is Java 15, with Java 11, a currently supported long-term support (LTS) version, released on September 25, 2018. Oracle released the last zero-cost public update for the legacy version Java 8 LTS in January 2019 for commercial use, although it will otherwise still support Java 8 with public updates for personal use indefinitely. Other vendors have begun to offer zero-cost builds of OpenJDK 8 and 11 that are still receiving security and other upgrades.",
    "Oracle (and others) highly recommend uninstalling outdated versions of Java because of serious risks due to unresolved security issues. Since Java 9, 10, 12, 13, and 14 are no longer supported, Oracle advises its users to immediately transition to the latest version (currently Java 15) or an LTS release."
]

# For available models for query generation, see: https://huggingface.co/BeIR/
# Here, we use a T5-large model that was trained on the MS MARCO dataset
tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained(
    'BeIR/query-gen-msmarco-t5-large-v1')
model.eval()

#Select the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

#Iterate over the paragraphs and generate for each some queries
with torch.no_grad():
    for para in paragraphs:
        input_ids = tokenizer.encode(para, return_tensors='pt').to(device)
        outputs = model.generate(input_ids=input_ids,
                                 max_length=64,
                                 do_sample=True,
                                 top_p=0.95,
                                 num_return_sequences=3)
Example #6
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = T5Tokenizer.from_pretrained(args.transformer_model)
    #Load datasets
    ## Conversation Response Ranking
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]:
        add_turn_separator = (
            args.task != "ubuntu_dstc8"
        )  # Ubuntu data has several utterances from same user in the context.
        train = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/train.tsv", args.sample_data,
            add_turn_separator)
        valid = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/valid.tsv", args.sample_data,
            add_turn_separator)
        special_tokens_dict = {
            'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
        }
        tokenizer.add_special_tokens(special_tokens_dict)
    ## Similar Question Retrieval and Passage Retrieval
    elif args.task in ["qqp", "linkso", "trec2020pr"]:
        if args.sample_data == -1: args.sample_data = None
        train = pd.read_csv(args.data_folder + args.task + "/train.tsv",
                            sep="\t",
                            nrows=args.sample_data)
        valid = pd.read_csv(args.data_folder + args.task + "/valid.tsv",
                            sep="\t",
                            nrows=args.sample_data)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(
            list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/train_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/valid_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, valid, tokenizer, ns_train, ns_val, 'generation',
        args.train_batch_size, args.val_batch_size, args.max_seq_len,
        args.sample_data, args.data_folder + args.task)

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
    )

    #Instantiate transformer model to be used
    model = T5ForConditionalGeneration.from_pretrained(args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model, train_loader, val_loader, test_loader, args.num_ns_eval,
        "generation", tokenizer, args.validate_every_epochs,
        args.num_validation_instances, args.num_epochs, args.lr,
        args.sacred_ex)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    #Predict for test
    logging.info("Predicting")
    preds, labels = trainer.test()
    res = results_analyses_tools.evaluate_and_aggregate(
        preds, labels, ['R_10@1'])
    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))

    #Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(
        preds,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv",
                    index=False)

    labels_df = pd.DataFrame(
        labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                     index=False)

    #Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(),
                   args.output_dir + "/" + args.run_id + "/model")

    #In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:
        logging.info("Predicting with dropout.")
        preds, uncertainties, labels, foward_passes_preds = trainer.test_with_dropout(
            args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(
            preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info(
                "Test (w. dropout and {} foward passes) {} : {:4f}".format(
                    args.num_foward_prediction_passes, metric, v))

        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" + args.run_id +
                        "/predictions_with_dropout.csv",
                        index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(
                args.output_dir + "/" + args.run_id +
                "/predictions_with_dropout_f_pass_{}.csv".format(i),
                index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                         index=False)

        uncertainties_df = pd.DataFrame(
            uncertainties,
            columns=["uncertainty_" + str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir + "/" + args.run_id +
                                "/uncertainties.csv",
                                index=False)

    return trainer.best_ndcg
Example #7
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
Example #8
missionaries canonized as saints in the Roman Catholic Church in 1930."""

questions = [
    "When Brébeuf was beatified?",  # T5 Answer:  <pad> 1 9 2 5 </s>
    "When Brébeuf was canonized?",  # T5 Answer:  <pad> 1 9 3 0 </s>
    "With how many missionaries was canonized?",  # T5 Answer:  <pad> 8 </s>
    "With how many missionaries was Brébeuf canonized?",  # T5 Answer:  <pad> 1 7 </s>
    "How many missionaries were canonized?",  # T5 Answer:  <pad> 8 </s>
    "How many missionaries were canonized as saints?",  # T5 Answer:  <pad> 1 7 </s>
]

tokenizer = T5Tokenizer.from_pretrained("nielsr/nt5-small-rc1",
                                        cache_dir=os.getenv(
                                            "cache_dir", "../../models"))
model = T5ForConditionalGeneration.from_pretrained("nielsr/nt5-small-rc1",
                                                   cache_dir=os.getenv(
                                                       "cache_dir",
                                                       "../../models"))

for question in questions:
    # encode context & question
    input_text = f"answer_me: {question} context: {context}"
    encoded_query = tokenizer(input_text,
                              return_tensors='pt',
                              padding='max_length',
                              truncation=True,
                              max_length=512)

    # generate answer
    generated_answer = model.generate(
        input_ids=encoded_query["input_ids"],
        attention_mask=encoded_query["attention_mask"],
Example #9
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

#In particular here, your model file should be named pytorch_model.bin
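# (A local path passed to from_pretrained() is generally expected to contain at least
#  a config.json alongside the weights file, e.g. pytorch_model.bin for PyTorch models.)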

# Model 2
# model = T5ForConditionalGeneration.from_pretrained('prakhar_t5_base/') # Model 2 with the training set split in an 80:20 ratio for train/val and run for 2 epochs
# Model 3
model = T5ForConditionalGeneration.from_pretrained(
    'prakhar_t5_model5/'
)  # Model 5 with the training set split in a 95:5 ratio for train/val and run for 3 epochs

tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_paraphraser')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device ", device)
model = model.to(device)


def paraphrase(sentence):
    text = "paraphrase: " + sentence + " </s>"
    max_len = 100
    encoding = tokenizer.encode_plus(text,
                                     pad_to_max_length=True,
                                     return_tensors="pt")
Example #10
      precision=16 if args.fp_16 else 32,
      amp_level=args.opt_level,
      gradient_clip_val=args.max_grad_norm,
      checkpoint_callback=checkpoint_callback,
      callbacks=[LoggingCallback()],
  )

  def get_dataset(tokenizer, type_path, args):
    return MyDataset(tokenizer=tokenizer,
                     data_dir=args.data_dir,
                     type_path=type_path,
                     max_len=args.max_seq_length)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(device)
  model = T5ForConditionalGeneration.from_pretrained(paths[1]).to(device)

  model = T5FineTuner(args)
  trainer = pl.Trainer(**train_params)
  trainer.fit(model)
  model.model.save_pretrained('t5_base_10pctconll_ner_old')
  """
  # ==================== Eval on Train
  print('\n\n==================== Eval on Train ====================\n\n')

  # dataset = ImdbDataset(tokenizer, 'aclImdb', 'test',  max_len=512)
  dataset = MyDataset(tokenizer,
                      './data/processed_full_conll_LabelOnly/',
                      'train',
                      max_len=64)
Example #11
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from typing import Union, List

# Code from https://towardsdatascience.com/paraphrase-any-question-with-t5-text-to-text-transfer-transformer-pretrained-model-and-cbb9e35f1555 # noqa: E501


model = T5ForConditionalGeneration.from_pretrained("ramsrigouthamg/t5_paraphraser")
tokenizer = T5Tokenizer.from_pretrained("ramsrigouthamg/t5_paraphraser")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device ", device)
model = model.to(device)
max_len = 256


def generate(
    sentence: str,
    verbose: int = 0,
    skip_special_tokens: bool = True,
    clean_up_tokenization_spaces: bool = True,
) -> List[str]:
    text = "paraphrase: " + sentence + " </s>"

    encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = (
        encoding["input_ids"].to(device),
        encoding["attention_mask"].to(device),
    )

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
Example #12
        accumulate_grad_batches=args.gradient_accumulation_steps,
        gpus=args.n_gpu,  #distributed_backend=args.distributed_backend,
        max_epochs=args.num_train_epochs,  # early_stop_callback=False,
        amp_level=args.opt_level,
        gradient_clip_val=args.max_grad_norm,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback],
        enable_pl_optimizer=True)

    model = LitT5Finetuner(args, train_data, test_data)
    trainer = pl.Trainer(**train_params)
    trainer.fit(model)

    model.model.save_pretrained(args.save_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = T5ForConditionalGeneration.from_pretrained(
        args.save_dir).to(device)

    print(
        '\n\n==================== Eval on Train One Batch ====================\n\n'
    )
    train_data_loader = DataLoader(train_data,
                                   batch_size=args.train_batch_size,
                                   shuffle=True)
    it = iter(train_data_loader)
    batch = next(it)
    outs = model.generate(input_ids=batch['source_ids'].cuda(),
                          attention_mask=batch['source_mask'].cuda())
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    texts = [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in batch['source_ids']
Example #13
 def __init__(self, model=None, tokenizer=None) -> None:
     super().__init__()
     self.model = model or T5ForConditionalGeneration.from_pretrained(
         "t5-small")
     self.tokenizer = tokenizer or T5Tokenizer.from_pretrained(
         "t5-small", additional_special_tokens=[USR_END_TKN], extra_ids=0)
Example #14
"""
import os
import torch
from nltk.tokenize import sent_tokenize, word_tokenize

try:
    import transformers
    from transformers import T5Tokenizer, T5ForConditionalGeneration
except ImportError:
    raise ImportError(INSTALL_MSG)

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained(
    "C:\\Users\\mmiin\\text-summarizer\\TextSumm\\Summarizer\\ML")


def predict(text, num_beam):
    number_of_sentences = sent_tokenize(text)
    print('number_of_sentences: ', len(number_of_sentences))
    number_of_words = word_tokenize(text)
    print('number_of_words: ', len(number_of_words))
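    # Heuristic: very short inputs (<= 4 sentences and <= 40 words) are returned
    # unchanged; anything longer is summarized with T5 below.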
    if (len(number_of_sentences) <= 4 and len(number_of_words) <= 40):
        summary_txt = text
    else:
        input_ids = tokenizer.batch_encode_plus(
            [text], truncation=True, max_length=1024,
            return_tensors='pt')['input_ids'].to(torch_device)
        summary_ids = model.generate(input_ids,
                                     num_beams=num_beam,
Example #15
    def __init__(self, hparams):
        super().__init__()

        self.hparams = hparams
        if self.hparams.vocab_name == "custom":
            self.tokenizer = get_custom_vocab()
        else:
            self.tokenizer = T5Tokenizer.from_pretrained(
                self.hparams.vocab_name)

        if "small" in self.hparams.model_name.split('-'):
            self.size = "small"
        elif "base" in self.hparams.model_name.split('-'):
            self.size = "base"
        elif "large" in self.hparams.model_name.split('-'):
            self.size = "large"
        else:
            raise ValueError("Couldn't detect model size from model_name.")

        if self.hparams.model_name[:2] == "pt":
            logging.info("Initializing from PTT5 checkpoint...")
            config, state_dict = self.get_ptt5()
            if self.hparams.architecture == "gen" or self.hparams.architecture == "categoric_gen":
                self.t5 = T5ForConditionalGeneration.from_pretrained(
                    pretrained_model_name_or_path=None,
                    config=config,
                    state_dict=state_dict)
            else:
                self.t5 = T5Model.from_pretrained(
                    pretrained_model_name_or_path=None,
                    config=config,
                    state_dict=state_dict)
        else:
            logging.info("Initializing from T5 checkpoint...")
            if self.hparams.architecture == "gen" or self.hparams.architecture == "categoric_gen":
                self.t5 = T5ForConditionalGeneration.from_pretrained(
                    self.hparams.model_name)
            else:
                self.t5 = T5Model.from_pretrained(self.hparams.model_name)

        D = self.t5.config.d_model

        if self.hparams.architecture == "mlp":
            # Replace T5 with a simple nonlinear input
            self.t5 = NONLinearInput(self.hparams.seq_len, D)

        if self.hparams.architecture != "gen" and self.hparams.architecture != "categoric_gen":
            if self.hparams.architecture == "categoric":
                assert self.hparams.nout != 1, "Categoric mode with 1 nout doesn't work with CrossEntropyLoss"
                self.linear = nn.Linear(D, self.hparams.nout)
            else:
                self.linear = nn.Linear(D, 1)

        if self.hparams.architecture == "categoric" or self.hparams.architecture == "categoric_gen":
            self.loss = nn.CrossEntropyLoss()
        else:
            self.loss = nn.MSELoss()

        self.pearson_calculator = PearsonCalculator()

        logging.info("Initialization done.")
Example #16
 def __init__(self, model='paulowoicho/t5-podcast-summarisation'):
     self.tokenizer = T5Tokenizer.from_pretrained(model)
     self.model = T5ForConditionalGeneration.from_pretrained(model)
Example #17
def unifiedqa_model_loader(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return model, tokenizer
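
# A hedged usage sketch for the loader above; the checkpoint name and the input
# format are illustrative (UnifiedQA-style), not taken from the source.
qa_model, qa_tokenizer = unifiedqa_model_loader("allenai/unifiedqa-t5-small")
question = "which is heavier, a ton of bricks or a ton of feathers? \n (a) bricks (b) feathers"
input_ids = qa_tokenizer(question, return_tensors="pt").input_ids
answer_ids = qa_model.generate(input_ids)
print(qa_tokenizer.decode(answer_ids[0], skip_special_tokens=True))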
Example #18
    encoded = dataset.map(convert_to_features, batched=True)
    columns = [
        'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'
    ]
    encoded.set_format(type='torch', columns=columns)

    train_dataloader = torch.utils.data.DataLoader(encoded["train"],
                                                   collate_fn=collate_fn,
                                                   batch_size=args.batch_size)
    val_dataloader = torch.utils.data.DataLoader(encoded["validation"],
                                                 collate_fn=collate_fn,
                                                 batch_size=args.batch_size *
                                                 4)

    if args.from_pretrained:
        model = T5ForConditionalGeneration.from_pretrained(args.model_select)
    else:
        config = T5Config.from_pretrained(args.model_select)
        model = T5ForConditionalGeneration(config)

    # Exclude biases and LayerNorm weights from weight decay when building the
    # optimizer parameter groups (standard practice for transformer fine-tuning).
    no_decay = ["bias", "LayerNorm.weight"]
    params_decay = [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    params_nodecay = [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    optim_groups = [
        {
Example #19
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
import pandas as pd

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# initialize the model architecture and weights
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# initialize the model tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

imdb_data = r"D:\ruin\data\IMDB Dataset2.csv"
df_imdb = pd.read_csv(imdb_data)

original_text = df_imdb['text'][10000]

# encode the text into tensor of integers using the appropriate tokenizer
inputs = tokenizer.encode(original_text,
                          return_tensors="pt",
                          max_length=512,
                          truncation=True)

# generate the summarization output
outputs = model.generate(inputs,
                         max_length=150,
                         min_length=40,
                         length_penalty=2.0,
                         num_beams=4,
                         early_stopping=True)
# just for debugging
Example #20
'shuffle': False,
'num_workers': 0
}

# -

training_set.summ_len

# +
training_loader = DataLoader(training_set, **train_params, drop_last=True)
val_loader = DataLoader(val_set, **val_params, drop_last=True)
test_loader = DataLoader(test_set, **val_params, drop_last=True)
val_loader_mini = DataLoader(val_set_mini, **val_params, drop_last=True)

logging.info("Loading model from {}".format(model_name))
model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)
logging.info("Move model to device {}".format(device))
model = model.to(device)
model.resize_token_embeddings(len(tokenizer))

optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)
#optimizer = Adafactor(model.parameters(),lr=1e-3,
#                      eps=(1e-30, 1e-3),
#                      clip_threshold=1.0,
#                      decay_rate=-0.8,
#                      beta1=None,
#                      weight_decay=0.0,
#                      relative_step=False,
#                      scale_parameter=False,
#                      warmup_init=False)
wandb.watch(model, log="all")
Example #21
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, T5Args):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = {
                key: value["value"]
                for key, value in sweep_config.as_dict().items()
                if key != "_wandb"
            }
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name, **self.args.config)

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name, truncate=True)

        if self.args.dynamic_quantize:
            self.model = torch.quantization.quantize_dynamic(self.model,
                                                             {torch.nn.Linear},
                                                             dtype=torch.qint8)

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_type = "T5"
        self.args.model_name = model_name

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None
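
# A minimal, hedged usage sketch for the constructor documented above. It assumes
# this is a simpletransformers-style T5Model wrapper with exactly the __init__ shown;
# the argument values below are illustrative, not taken from the source.
model = T5Model(
    "t5-small",                    # HF checkpoint name or path to a local model directory
    args={"max_seq_length": 128},  # dict of overrides merged into the default args
    use_cuda=False,                # force CPU so the sketch runs without a GPU
)
print(model.device, model.args.model_name)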
Example #22
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path != "new":
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path)
    else:
        config = AutoConfig.from_pretrained("t5-small")
        model = T5ForConditionalGeneration(config=config)

    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DoNothingDataCollator()

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        # trainer.train(model_path=model_path)
        trainer.train()
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        model.eval()
        data_collator = DoNothingDataCollatorForGeneration()
        sampler = SequentialSampler(eval_dataset)
        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=training_args.eval_batch_size,
            collate_fn=data_collator.collate_batch,
        )
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        with open(output_eval_file, "w") as writer:
            for inputs in tqdm(data_loader, "Prediction"):
                # Move every tensor in the batch to the evaluation device
                for k, v in inputs.items():
                    inputs[k] = v.to(training_args.device)

                with torch.no_grad():
                    outputs = model.generate(
                        input_ids=inputs['input_ids'],
                        attention_mask=inputs['attention_mask'],
                        max_length=12)
                    dec = [tokenizer.decode(ids) for ids in outputs]

                    for line in dec:
                        writer.write(line + "\n")

    return results
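`get_dataset`, `DoNothingDataCollator` and `DoNothingDataCollatorForGeneration` are project-specific helpers that are not shown in this example. Purely as a hypothetical sketch, a pass-through collator of that shape would simply stack already-tokenized tensors into a batch:

import torch


class PassThroughCollator:
    """Illustrative stand-in: assumes each example is a dict of equally sized tensors."""

    def collate_batch(self, examples):
        # Stack per-example tensors into batched tensors, key by key.
        return {key: torch.stack([example[key] for example in examples])
                for key in examples[0]}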
Example #23
import json
import textwrap

import torch
from pydub import AudioSegment
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, T5Config,
                          T5ForConditionalGeneration, T5Tokenizer)

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')


def summarize_txt(text):
    preprocess_text = text.strip().replace("\n", "")
    t5_prepared_Text = "summarize: " + preprocess_text
    print("original text preprocessed: \n", preprocess_text)
    tokenized_text = tokenizer.encode(t5_prepared_Text,
                                      return_tensors="pt").to(device)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=100,
                                 early_stopping=True)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output


def transcribe():
    audio = AudioSegment.from_file("file.webm")
    audio.export("transcribe_sound.mp3", format="mp3")

    clipnum = split("transcribe_sound.mp3")
    print(clipnum)
    # convert mp3 file to wav

    transcription = ""
    i = 0
    while i < (clipnum + 1):

        transcription += " " + transcribe_file("chunks/chunk" + str(i) +
                                               ".wav")
        i += 1

    print(transcription)

    lines = textwrap.wrap(transcription, 2000, break_long_words=False)

    z = 0

    lineslen = len(lines)

    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    device = torch.device('cpu')

    tokenizer2 = AutoTokenizer.from_pretrained("valhalla/t5-base-e2e-qg")
    model2 = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-e2e-qg")

    questions = []

    while z < lineslen:
        text = lines[z]
        preprocess_text = text.strip().replace("\n", "")
        t5_prepared_Text = "summarize: " + preprocess_text

        tokenized_text = tokenizer.encode(t5_prepared_Text,
                                          return_tensors="pt").to(device)
        # summarize
        summary_ids = model.generate(tokenized_text,
                                     num_beams=4,
                                     no_repeat_ngram_size=2,
                                     min_length=70,
                                     max_length=100,
                                     early_stopping=True)

        output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        print("\n\nSummarized text number " + str((z + 1)) + ": \n", output)

        # Question generation
        tokenized_text2 = tokenizer2.encode(output,
                                            return_tensors="pt").to(device)
        # generate questions from the summary
        summary_ids2 = model2.generate(
            tokenized_text2,
            num_beams=4,
            max_length=64,
        )

        output2 = tokenizer2.decode(summary_ids2[0], skip_special_tokens=True)

        print("\n\nGenerated Question " + str((z + 1)) + ": \n", output2)
        # the e2e-qg model separates generated questions with "<sep>"
        generated = output2.split("<sep>")
        for question in generated:
            questions.append(question)
        z += 1
    return questions
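`split` and `transcribe_file` used above are helpers defined elsewhere in this project and are not shown. Purely as an illustration (the real implementation may differ), a per-chunk transcription helper is often built on the SpeechRecognition package:

import speech_recognition as sr


def transcribe_file_sketch(wav_path):
    # Hypothetical stand-in for the project's transcribe_file helper.
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_path) as source:
        audio = recognizer.record(source)  # read the whole wav chunk
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return ""  # nothing intelligible was recognized in this chunk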
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        if args and "manual_seed" in args:
            random.seed(args["manual_seed"])
            np.random.seed(args["manual_seed"])
            torch.manual_seed(args["manual_seed"])
            if "n_gpu" in args and args["n_gpu"] > 0:
                torch.cuda.manual_seed_all(args["manual_seed"])

        self.args = {
            "dataset_class": None,
            "do_sample": False,
            "max_steps": -1,
            "evaluate_generated_text": False,
            "num_beams": 1,
            "max_length": 20,
            "repetition_penalty": 1.0,
            "length_penalty": 2.0,
            "early_stopping": True,
            "preprocess_inputs": True,
        }

        self.args.update(global_args)

        if args:
            self.args.update(args)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name,
                                               **self.args["config"])

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        if not use_cuda:
            self.args["fp16"] = False

        self.args["model_name"] = model_name

        if self.args["wandb_project"] and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args["wandb_project"] = None
Example #26
File: t5.py  Project: khanhgithead/ParlAI
def build_t5(opt: Opt) -> T5ForConditionalGeneration:
    if not check_hf_version(HF_VERSION):
        raise RuntimeError('Must use transformers package >= 4.3 to use t5')
    return T5ForConditionalGeneration.from_pretrained(
        opt['t5_model_arch'], dropout_rate=opt['t5_dropout'])
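Keyword arguments such as `dropout_rate` that `from_pretrained` does not consume itself are applied as overrides on top of the pretrained configuration. A small self-contained illustration (the `t5-small` checkpoint and the 0.2 value are arbitrary):

from transformers import T5Config, T5ForConditionalGeneration

# Override a single field while keeping the rest of the pretrained config.
config = T5Config.from_pretrained("t5-small", dropout_rate=0.2)
model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
print(model.config.dropout_rate)  # 0.2 instead of the default 0.1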
Example #27
def main(args):
    parser = HfArgumentParser((EvalArguments, ))

    # Read command-line arguments if present, else read arguments from json file
    if len(args) >= 2:
        args = parser.parse_args_into_dataclasses(args=args)[0]
    else:
        args = parser.parse_json_file(
            json_file=os.path.abspath('eval_args.json'))[0]

    # Initialize the tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(
        args.tokenizer_name_or_path
        if args.tokenizer_name_or_path else args.model_name_or_path)
    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the dataset
    dataset = torch.load(args.file_path)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16)

    # Evaluate accuracy on the dev set
    if args.do_eval:
        predictions = []
        targets = []
        model.to(device)

        model.eval()
        with torch.no_grad():
            for batch in tqdm(dataloader):
                prediction = model.generate(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    max_length=args.max_target_length)

                prediction = [tokenizer.decode(ids) for ids in prediction]
                target = [tokenizer.decode(ids) for ids in batch['target_ids']]

                predictions.extend(prediction)
                targets.extend(target)

        accuracy = metrics.accuracy_score(targets, predictions)
        output_file = os.path.join(args.model_name_or_path,
                                   'eval_accuracy.txt')
        with open(output_file, 'w') as writer:
            writer.write('Accuracy = %f\n' % accuracy)

    # Generate predictions for provided file
    if args.do_predict:
        predictions = []
        model.to(device)

        model.eval()
        with torch.no_grad():
            for batch in tqdm(dataloader):
                prediction = model.generate(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    max_length=args.max_target_length,
                    num_beams=args.num_beams,
                    early_stopping=args.early_stopping,
                    no_repeat_ngram_size=args.no_repeat_ngram_size,
                    length_penalty=args.length_penalty)

                prediction = [
                    tokenizer.decode(ids,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                    for ids in prediction
                ]
                predictions.extend(prediction)

        output_file = os.path.join(args.model_name_or_path, 'predictions.txt')
        with open(output_file, 'w') as writer:
            writer.write('\n'.join(predictions))
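The dataset above is read back with `torch.load` and only needs to yield items exposing `input_ids`, `attention_mask` and `target_ids` tensors. A hedged sketch of how such a file could be produced (the sentences and the `t5-small` checkpoint are placeholders):

import torch
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

sources = ['translate English to German: Hello.',
           'translate English to German: Thank you.']
targets = ['Hallo.', 'Danke.']

# Tokenize with padding so every tensor in the saved list has the same length.
enc = tokenizer(sources, padding=True, return_tensors='pt')
dec = tokenizer(targets, padding=True, return_tensors='pt')

dataset = [{'input_ids': enc['input_ids'][i],
            'attention_mask': enc['attention_mask'][i],
            'target_ids': dec['input_ids'][i]} for i in range(len(sources))]
torch.save(dataset, 'eval_dataset.pt')  # later loaded via torch.load(...)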
Example #28
                    help='tag for training data',
                    type=str)
parser.add_argument('--train_data_path',
                    required=True,
                    help='training data path')
parser.add_argument('--output_path',
                    required=True,
                    help='output directory path')
parser.add_argument('--epoch', default=1, type=int)
parser.add_argument('--batch_size', default=4, type=int)
parser.add_argument('--weight_decay', default=5e-5, type=float)
parser.add_argument('--lr', default=3e-4, type=float)
parser.add_argument('--gra_acc_steps', default=8, type=int)
args = parser.parse_args()

model = T5ForConditionalGeneration.from_pretrained(args.pretrained_model_path)
train_dataset = TrainerDataset(args.train_data_path)

training_args = TrainingArguments(
    output_dir=args.output_path,
    num_train_epochs=args.epoch,
    per_device_train_batch_size=args.batch_size,
    weight_decay=args.weight_decay,
    learning_rate=args.lr,
    gradient_accumulation_steps=args.gra_acc_steps,
    logging_dir='./logs',
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

trainer.train()
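`TrainerDataset` is defined elsewhere in this project. A hedged sketch of the shape such a dataset usually takes for T5 fine-tuning with `Trainer` (the field names, tokenizer and `max_len` below are assumptions, not the original implementation):

from torch.utils.data import Dataset
from transformers import T5Tokenizer


class ToyT5Dataset(Dataset):
    """Illustrative only: each item provides input_ids, attention_mask and labels."""

    def __init__(self, pairs, tokenizer_name='t5-small', max_len=64):
        tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)
        self.examples = []
        for source, target in pairs:
            enc = tokenizer(source, truncation=True, max_length=max_len,
                            padding='max_length', return_tensors='pt')
            lab = tokenizer(target, truncation=True, max_length=max_len,
                            padding='max_length', return_tensors='pt')
            self.examples.append({
                'input_ids': enc['input_ids'].squeeze(0),
                'attention_mask': enc['attention_mask'].squeeze(0),
                'labels': lab['input_ids'].squeeze(0),
            })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]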
    def model(self):
        return T5ForConditionalGeneration.from_pretrained("t5-base").to(
            torch_device)
Example #30
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--query_field",
        default=None,
        type=str,
        required=True,
        help="Query fields to use as input. Supported values are title, desc, title_desc",
    )
    parser.add_argument(
        "--train_test_file",
        default=None,
        type=str,
        required=True,
        help="trec_json file path that contains train/test data",
    )
    parser.add_argument(
        "--qrel_file", default=None, type=str, help="Relevance judgment file"
    )
    parser.add_argument(
        "--query_file", default=None, type=str, help="File that contains queries"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )
    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    args.task_name="podcast"
    processor = PodcastProcessor(query_field=args.query_field)
    args.output_mode = "classification"
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir,
        use_cache=True
    )
    args.model_type = config.model_type
    tokenizer = AutoTokenizer.from_pretrained(
        't5-base',
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir,
    )
    model = T5ForConditionalGeneration.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = T5ForConditionalGeneration.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.model_name_or_path]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = T5ForConditionalGeneration.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix=prefix)